Gebruiker:OlafJanssen/KBserviceMappings
Latest update: 21 December 2023 Author: Olaf Janssen, Wikimedia coordinator @KB, national library of the Netherlands """
import json import os import re import overall
# Retrieve the services dictionary and its sub-dictionaries --> @VERA: see the very bottom of this page for this JSON
# Load the service-name mapping once, at import time. The encoding is given
# explicitly because JSON files are UTF-8; relying on the platform default
# (e.g. cp1252 on Windows) can mis-decode non-ASCII service names.
with open('servicesnames_dict.json', 'r', encoding='utf-8') as file:
    data = file.read()

servicesnames_dict = json.loads(data)
kb_services = servicesnames_dict.get('kb_services')
ob_services = servicesnames_dict.get('ob_services')

# Regex fragment matching the start of a KB resolver URL, up to and including
# 'urn='. It is already escaped for regex use and is interpolated into the
# patterns of several is_*_url functions.
resolverprefix = "https?:\\/\\/resolver\\.kb\\.nl/resolve\\?urn="
# Check that the ids used in the function/def names in Blocks 1 and 2 below
# also occur as keys in "servicesnames_dict.json".
# Example: servicesnames_dict.json has a key named 'bibliopolis', which is
# included in the function name (in Block 1) 'def is_bibliopolis_url(url)'
# --> this is OK!
# Note: the reverse direction (keys without a matching function) is not
# checked by this function.
def check_functionnames_against_dictkeys(dict):
    """Report which ``is_<id>_url`` function names have a matching dictionary key.

    The function collects the function names of this script via
    ``overall.get_function_names`` and, for every name of the form
    ``is_<id>_url``, checks whether ``<id>`` is among the keys returned by
    ``overall.find_order_keys(dict, 3)``.

    Args:
        dict (dict): Nested dictionary to search (the parameter name shadows
            the builtin; it is kept for backward compatibility with callers
            that pass it by keyword).

    Returns:
        None. One message is printed per function name: an OK message, a
        warning that the id is not a key, or a warning that the name does not
        follow the ``is_<id>_url`` syntax.
    """
    filepath = os.path.abspath(__file__)  # Path of this .py file
    function_names = overall.get_function_names(filepath)
    defsyntax = r'^is_(.*?)_url$'  # The pattern to match, i.e. "is_XXXXX_url"
    keys = overall.find_order_keys(dict, 3)

    for name in function_names:
        if not overall.matches_pattern(name, defsyntax):
            print(f"Warning warning warning!! Function name '{name}' does not follow the required syntax '{defsyntax}'.")
            continue

        # Take everything between 'is_' and '_url'. Splitting on '_' and
        # taking element 1 would truncate multi-part ids such as
        # 'delpher_static' to 'delpher'.
        match = re.match(defsyntax, name)
        function_name_id = match.group(1) if match else name.split("_")[1]

        if function_name_id in keys:
            print(f"OK OK OK OK !! The function name id '{function_name_id}' in the function name '{name}' is reflected as "
                  f"a 3rd order key in the provided dictionary.")
        else:
            print(f"Warning warning warning!! The function name id '{function_name_id}' in the function name '{name}' "
                  f"is not reflected as a 3rd order key in the provided dictionary.")
# Only execute 'check_functionnames_against_dictkeys()' if this .py module is run directly.
# If this module is imported (via an import statement), the function 'check_functionnames_against_dictkeys()' is not executed.
# Run the function-name/dictionary-key consistency check only when this file
# is executed as a script; importing the module does not trigger it.
if __name__ == "__main__":
    check_functionnames_against_dictkeys(servicesnames_dict)
def merge_dicts(kb_dict, ob_dict):
    """Flatten the category dictionaries of KB and OB services into one dict.

    Every value of ``kb_dict`` and ``ob_dict`` that is itself a dictionary
    (one per service category) is merged into a single flat
    ``{service_key: service_name}`` dictionary. KB categories are merged
    first, then OB categories.

    Args:
        kb_dict (dict | None): Mapping of KB category key -> dict of services.
        ob_dict (dict | None): Mapping of OB category key -> dict of services.

    Returns:
        dict: A single dictionary containing all merged services. An empty
        or ``None`` input contributes nothing.

    Note:
        If the same key is present in several category dictionaries, the
        value from the last one merged wins.
    """
    merged_services = {}
    # Work on the arguments themselves rather than on module-level globals,
    # so the function really merges what it is given.
    for source in (kb_dict, ob_dict):
        if not source:
            continue
        for category_services in source.values():
            if isinstance(category_services, dict):
                merged_services.update(category_services)
    return merged_services
def determine_service(url):
    """Determine the KB or OB service a URL belongs to.

    All service dictionaries are flattened with ``merge_dicts`` and, for each
    service key, the module-level function ``is_<service_key>_url`` is looked
    up and called with the URL. The name of the first service whose check
    function returns a truthy value is returned.

    Args:
        url (str): The URL to be checked against the service patterns.

    Returns:
        str: The name of the matched service; an empty string if no service
        matches; the message 'Invalid or empty URL provided' if ``url`` is
        not a non-empty string.

    Note:
        Keys for which no ``is_<service_key>_url`` function exists are
        skipped.
    """
    if not isinstance(url, str) or not url:
        return 'Invalid or empty URL provided'

    # Combine all service categories into one dict for streamlined processing
    all_services = merge_dicts(kb_services, ob_services)

    for service_key, service_name in all_services.items():
        # Skip the category-label keys of certain categories.
        # TODO: Check if this skip is still necessary
        if service_key in ('kbsite', 'delpher', 'legacy', 'small'):
            continue

        check_function = globals().get(f'is_{service_key}_url')
        if check_function and check_function(url):
            return service_name

    # No service matched: return the documented empty string (a bare
    # `return` would hand callers None instead).
    return ''
def determine_category(service):
    """Map a service name to the label of the category that contains it.

    The module-level ``kb_services`` and ``ob_services`` mappings are
    combined (OB entries override KB entries with the same category key).
    The first category whose values include ``service`` is selected, and the
    entry stored in that category under the category's own key is returned.

    Args:
        service (str): The service name to be categorized.

    Returns:
        The category entry described above when the service is found
        (``None`` if that category has no entry under its own key);
        otherwise ``service`` itself, unchanged.
    """
    combined = dict(kb_services)
    combined.update(ob_services)

    for category_key, category_services in combined.items():
        if service not in category_services.values():
            continue
        return category_services.get(category_key)

    return service
# Long list of individual URL pattern matching functions
# def name syntax should always be "is_XXXXX_url(url)"
# BLOCK 1: For 'classical' KB services #########################
# KB.nl
def is_kbmain_url(url):
    """Check whether the URL belongs to the KB main website (www.kb.nl).

    Accepted are URLs starting with
      * http(s)://(www.)kb.nl
      * http(s)://galerij.kb.nl
      * http(s)://blog.kb.nl
    where the host is followed by '/', '?', '#' or the end of the URL.

    Rejected are URLs that start with
      * http(s)://(kranten|boeken|boeken1|tijdschriften|anp|poortman|resolver|
        opc4|jsru|sru|watermark|collecties|manuscripts).kb.nl
      * http(s)://XXXXX.authkb.kb.nl/
      * http(s)://(www.)kb.nl/bc/koopman
      * http(s)://(www.)kb.nl/themas
      * http(s)://(www.)kb.nl/en/themes

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL is a KB main website URL, False otherwise.
    """
    # Paths on (www.)kb.nl that belong to the collections website.
    collections_paths = r'https?://(www\.)?kb\.nl/(bc/koopman|themas|en/themes)'
    # Subdomains of kb.nl that host other services, e.g. 'http://kranten.kb.nl'.
    other_subdomains = r'https?://(kranten|boeken1?|tijdschriften|anp|poortman|resolver|opc4|jsru|sru|watermark|collecties|manuscripts)\.kb\.nl'
    # Proxy hosts such as 'https://doi-org.access.authkb.kb.nl/'.
    authkb_hosts = r'https?://.*\.authkb\.kb\.nl/'
    # The main site itself. The host must be followed by a delimiter or the
    # end of the string, so that 'https://www.kb.nl' (no trailing slash) is
    # accepted while look-alike hosts such as 'kb.nlx.com' are not.
    main_site = r'https?://(www\.|galerij\.|blog\.)?kb\.nl([/?#]|$)'

    excluded = (collections_paths, other_subdomains, authkb_hosts)
    if any(re.match(pattern, url) for pattern in excluded):
        return False
    return re.match(main_site, url) is not None
def is_kbcollections_url(url):
    """Check whether the URL belongs to the KB collections website.

    Accepted are URLs starting with
      * http(s)://collecties.kb.nl/
      * http(s)://(www.)kb.nl/themas/
      * http(s)://(www.)kb.nl/en/themes/
      * http(s)://(www.)kb.nl/bc/koopman

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL starts with one of the prefixes above,
        False otherwise.
    """
    # Either a collections path on (www.)kb.nl or the collecties subdomain.
    collections_regex = re.compile(
        r'https?://((www\.)?kb\.nl/(bc/koopman|themas/|en/themes/)|collecties\.kb\.nl/)'
    )
    return bool(collections_regex.match(url))
# Delpher
def is_delpher_static_url(url):
    """Check whether the URL is one of the static Delpher pages.

    Accepted are
      * exactly http(s)://(www.)delpher.nl, optionally followed by '/nl'
        and/or a single trailing '/'
      * URLs starting with http(s)://(www.)delpher.nl/, optionally 'nl/',
        and then 'platform', 'over-delpher' or 'thema'

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the two forms, False otherwise.
    """
    # The bare home page: nothing may follow the optional '/nl' and '/'.
    homepage = r'^https?://(www\.)?delpher\.nl(/nl)?/?$'
    # The three static sections, with or without the 'nl/' path segment.
    sections = r'^https?://(www\.)?delpher\.nl/(nl/)?(platform|over-delpher|thema)'
    return any(re.match(candidate, url) for candidate in (homepage, sections))
def is_delpher_newspapers_basic_url(url):
    """Check whether the URL belongs to 'Delpher Kranten 1618-1995 (Basiscollectie)'.

    Accepted are
      * Static patterns
        - Starting with http(s)://kranten.(kb|delpher).nl(/)
        - Starting with http(s)://(www.)delpher.nl/kranten(/)
        - Starting with http(s)://(www.)delpher.nl/nl/kranten(/)
      * Resolver newspaper URL patterns
        - Starting with the resolver prefix (``resolverprefix``) followed by
          one of the newspaper resolver IDs listed in the function body
          (taken and modified from
          https://jsru.kb.nl/sru/sru?query=*&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=0&recordSchema=ddd&x-collection=DDD_krantnr&x-facets=facets:mdoSet)

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after 'kranten' is escaped so that it only
    # matches a literal '.', not an arbitrary character.
    static_patterns = (
        r'^https?://kranten\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?kranten/?',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True

    newspaper_resolver_ids = (
        'ddd', 'ABCDDD', 'KBDDD02', 'KBNRC01', 'KBPERS01', 'MMCC01', 'MMCC02', 'MMCODA01', 'MMCODA02',
        'MMDA03', 'MMECAL02', 'MMGARO01', 'MMGASL01', 'MMGAVL01', 'MMGAZS01', 'MMGEM01', 'MMHCO01',
        'MMHCO02', 'MMIISG05', 'MMIISG07', 'MMIISG18', 'MMKB04', 'MMKB08', 'MMKB12', 'MMKB15', 'MMKB19',
        'MMKB23', 'MMKB27', 'MMKB32', 'MMMAAS01', 'MMMHW01', 'MMNHA02', 'MMNHA03', 'MMNIOD04', 'MMNIOD05',
        'MMPM05', 'MMRANM02', 'MMRANM03', 'MMRANMG01', 'MMRAZ02', 'MMRAZ03', 'MMRHCE01', 'MMRHCE02',
        'MMRHCG03', 'MMSAA06', 'MMSAB03', 'MMSAB04', 'MMSADB01', 'MMSAEN01', 'MMSAEN02', 'MMSAK01',
        'MMSARO02', 'MMSAVL02', 'MMSHCL03', 'MMTELE01', 'MMTRES02', 'MMTRES03', 'MMTRES04', 'MMUBTB04',
        'MMUBWA01', 'MMVEEN01', 'MMVEEN02', 'MMWFA01',
    )
    # Resolver URLs: resolver prefix immediately followed by a known ID.
    return any(
        re.match(rf'{resolverprefix}{resolver_id}', url)
        for resolver_id in newspaper_resolver_ids
    )
def is_delpher_newspapers_external_url(url):
    """Check whether the URL belongs to 'Externe regionale kranten' on Delpher.

    Accepted are URLs starting with http(s)://(www.)delpher.nl/, optionally
    'nl/', and then 'regio'.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    regio_regex = re.compile(r'^https?://(www\.)?delpher\.nl/(nl/)?regio/?')
    return bool(regio_regex.match(url))
def is_delpher_books_basic_url(url):
    """Check whether the URL belongs to 'Boeken 17e t/m 20e eeuw (Basiscollectie)'.

    Accepted are
      * Static patterns
        - Starting with http(s)://boeken.(kb|delpher).nl(/)
        - Starting with http(s)://(www.)delpher.nl/boeken/
          (the trailing '/' is compulsory, to prevent matching 'boeken1')
        - Starting with http(s)://(www.)delpher.nl/nl/boeken/
          (the trailing '/' is compulsory, to prevent matching 'boeken1')
      * Resolver books URL patterns
        - Starting with the resolver prefix (``resolverprefix``) followed by
          one of the book resolver IDs listed in the function body
          (taken and modified from
          https://jsru.kb.nl/sru/sru?query=*&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=0&recordSchema=ddd&x-collection=BOEKEN_boek&x-facets=facets:mdoSet)

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after 'boeken' is escaped so that it only
    # matches a literal '.', not an arbitrary character.
    static_patterns = (
        r'^https?://boeken\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?boeken/',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True

    book_resolver_ids = (
        'dpo', 'DEJONG', 'GBR02', 'KBEU7475', 'KBITA01', 'KBTRES01', 'KONB10', 'KONB15', 'MMATR03', 'MMATR08',
        'MMCMC01', 'MMKB02', 'MMKB02A', 'MMKB02B', 'MMKB05', 'MMKB06', 'MMKB11', 'MMKB18', 'MMKB18A', 'MMKB18B',
        'MMKB18C', 'MMKB18D', 'MMKB21', 'MMKB22', 'MMKB24', 'MMKB25', 'MMKB28', 'MMKB31', 'MMKIT03', 'MMMVC01',
        'MMNIOD07', 'MMSFKB02', 'MMSFUBA02', 'MMSFUBU02', 'MMTSGG01', 'MMTUA01', 'MMTUK01', 'MMUBA08', 'MMUBA09',
        'MMUBL07', 'MMUBVU02', 'MMUBVU05', 'NIOD02', 'NIOD05', 'NOM01', 'PRB01', 'SAB01',
    )
    # Resolver URLs: resolver prefix immediately followed by a known ID.
    return any(
        re.match(rf'{resolverprefix}{resolver_id}', url)
        for resolver_id in book_resolver_ids
    )
def is_delpher_books_google_url(url):
    """Check whether the URL belongs to 'Boeken Google' on Delpher.

    Accepted are URLs
      - Starting with http(s)://boeken1.(kb|delpher).nl(/)
      - Starting with http(s)://(www.)delpher.nl/boeken1(/)
      - Starting with http(s)://(www.)delpher.nl/nl/boeken1(/)

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # The dot after 'boeken1' is escaped so that it only matches a literal
    # '.', not an arbitrary character.
    patterns = (
        r'^https?://boeken1\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?boeken1/?',
    )
    return any(re.match(pattern, url) for pattern in patterns)
def is_delpher_magazines_url(url):
    """Check whether the URL belongs to 'Tijdschriften 19e en 20e eeuw' on Delpher.

    Accepted are
      * Static patterns
        - Starting with http(s)://tijdschriften.(kb|delpher).nl(/)
        - Starting with http(s)://(www.)delpher.nl/tijdschriften(/)
        - Starting with http(s)://(www.)delpher.nl/nl/tijdschriften(/)
      * Resolver magazines URL patterns
        - Starting with the resolver prefix (``resolverprefix``) followed by
          one of the magazine resolver IDs listed in the function body
          (taken and modified from
          https://jsru.kb.nl/sru/sru?query=*&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=0&recordSchema=ddd&x-collection=DTS_document&x-facets=facets:mdoSet)

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after 'tijdschriften' is escaped so that it
    # only matches a literal '.', not an arbitrary character.
    static_patterns = (
        r'^https?://tijdschriften\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?tijdschriften/?',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True

    magazine_resolver_ids = (
        'dts', 'CBS_EXT', 'GMDH01', 'KBDBNL02', 'KBDBVL02', 'KBDC001', 'KBKWG02', 'KBNA001', 'KBVRNL01', 'MMAD01',
        'MMALET01', 'MMALET02', 'MMATR01', 'MMATR02', 'MMATR05', 'MMATR06', 'MMATR07', 'MMATR09', 'MMAVB01', 'MMBOY01',
        'MMBPH01', 'MMBZK02', 'MMCBS01', 'MMCBS02', 'MMCBS03', 'MMCMC03', 'MMCMC05', 'MMCSS01', 'MMDA04', 'MMEGG01', 'MMENLK02',
        'MMENLK03', 'MMENLK04', 'MMEUR03', 'MMEYE01', 'MMEYE02', 'MMGAMS01', 'MMGAMS02', 'MMGAZS02', 'MMHCC01', 'MMHFC01',
        'MMHNI01', 'MMHNI03', 'MMIISG02', 'MMIISG06', 'MMIISG08', 'MMIISG10', 'MMIISG13', 'MMIISG15', 'MMIISG17', 'MMIISG20', 'MMIISG21',
        'MMKB07', 'MMKB10', 'MMKB13', 'MMKB14', 'MMKB16', 'MMKB26', 'MMKB30', 'MMKB37', 'MMKDC03', 'MMKDC04', 'MMKDC05', 'MMKDC07', 'MMKDC09',
        'MMKDC10', 'MMKIT04', 'MMKITLV3', 'MMKNAU01', 'MMKNGU01', 'MMKNMP01', 'MMKNMP02', 'MMKNRB01', 'MMKNTB01', 'MMKNVB01', 'MMKPSV01',
        'MMKVLO01', 'MMKVLO02', 'MMKWG01', 'MMLIND01', 'MMMI02', 'MMNA12', 'MMNAT03', 'MMNDR01', 'MMNFM01', 'MMNHV01', 'MMNIBG01', 'MMNIOD08',
        'MMNMM02', 'MMNPZ01', 'MMNVOG01', 'MMNVOG02', 'MMOBDH01', 'MMOHKT01', 'MMPM02', 'MMPM03', 'MMPM04', 'MMRAA02', 'MMRHCG01', 'MMRHCG02',
        'MMRKD09', 'MMRMO02', 'MMSARO01', 'MMSAVL01', 'MMSHCL02', 'MMSPOM01', 'MMTEY01', 'MMTUK02', 'MMTUK03', 'MMUBA10', 'MMUBA13', 'MMUBA15',
        'MMUBA16', 'MMUBL08', 'MMUBL09', 'MMUBMA01', 'MMUBMA02', 'MMUBTB01', 'MMUBTB02', 'MMUBTB03', 'MMUBU02', 'MMUBVU04', 'MMUBVU06', 'MMUBVU07',
        'MMUBVU08', 'MMUBWA02', 'MMUBWA03', 'MMUBWA04', 'MMUBWA05', 'MMUTRA01', 'MMUTRA03', 'MMVRED01', 'MMZAH01', 'MMZAH03', 'MMZB04', 'MMZEND01',
        'MMZEND02', 'MMZOU01', 'NIOD07', 'NIOD09', 'OBA01',
    )
    # Resolver URLs: resolver prefix immediately followed by a known ID.
    return any(
        re.match(rf'{resolverprefix}{resolver_id}', url)
        for resolver_id in magazine_resolver_ids
    )
def is_delpher_radiobulletins_url(url):
    """Check whether the URL belongs to 'Radiobulletins van het ANP' on Delpher.

    Accepted are
      * Static patterns
        - Starting with http(s)://(anp|radiobulletins).(kb|delpher).nl(/)
        - Starting with http(s)://(www.)delpher.nl/radiobulletins(/)
        - Starting with http(s)://(www.)delpher.nl/nl/radiobulletins(/)
      * Resolver radiobulletins URL pattern
        - Starting with the resolver prefix (``resolverprefix``) followed by
          'anp:'

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after the subdomain is escaped so that it
    # only matches a literal '.', not an arbitrary character.
    static_patterns = (
        r'^https?://(anp|radiobulletins)\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?radiobulletins/?',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True

    # Resolver URLs for ANP radio bulletins.
    return re.match(rf'^{resolverprefix}anp:', url) is not None
def is_databibnl_url(url):
    """Check whether the URL belongs to data.bibliotheken.nl.

    Accepted are URLs starting with http(s)://data.bibliotheken.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    databib_regex = r'https?://data\.bibliotheken\.nl/?'
    return bool(re.match(databib_regex, url))
# Other independent KB services
def is_dbnl_url(url):
    """Check whether the URL belongs to DBNL.

    Accepted are URLs
      * Starting with http(s)://dbnl.nl or http(s)://dbnl.org
      * Starting with http(s)://<subdomain>.dbnl.nl or .dbnl.org
      * Starting with the resolver prefix (``resolverprefix``) followed by
        'dbnl:'

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host. With '.*' any URL that merely embeds
    # '.dbnl.org' somewhere in its path or query (for example an archived
    # copy of a DBNL page) would be classified as DBNL as well.
    if re.match(r'https?://([^/?#]*\.)?dbnl\.(nl|org)', url):
        return True
    return re.match(rf'{resolverprefix}dbnl:', url) is not None
def is_gvn_url(url):
    """Check whether the URL belongs to Geheugen (van Nederland) (GvN).

    Accepted are URLs
      * Starting with http(s)://geheugen.delpher.nl
      * Starting with http(s)://(www.)geheugenvannederland.nl
      * Starting with the resolver prefix (``resolverprefix``) followed by
        'urn:gvn:'

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # One anchored alternation of the three accepted prefixes:
    #   1. https?://geheugen\.delpher\.nl
    #   2. https?://(www\.)?geheugenvannederland\.nl
    #   3. {resolverprefix}urn:gvn:
    # The dots are escaped because '.' is a special character in regular
    # expressions; '|' separates the alternatives.
    gvn_regex = rf'^(https?://geheugen\.delpher\.nl|https?://(www\.)?geheugenvannederland\.nl|{resolverprefix}urn:gvn:)'
    return bool(re.match(gvn_regex, url))
def is_kbcat_url(url):
    """Check whether the URL belongs to the KB catalogue.

    Accepted are URLs
      * Starting with http(s)://opc4.kb.nl
      * Starting with the resolver prefix (``resolverprefix``) followed by
        'PPN:'
      * Starting with http(s)://opc-kb.oclc.org
      * Starting with http(s)://webggc.oclc.org/cbs/ (a 'DB=2.37' part
        further on in the URL is optional in the pattern)

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Anchored alternation of the four accepted prefixes. In the last
    # alternative the '(DB=2\.37)?' group is optional, so it does not
    # restrict which webggc.oclc.org/cbs/ URLs are accepted.
    catalogue_regex = re.compile(
        rf'^(https?://opc4\.kb\.nl|{resolverprefix}PPN:|https?://opc-kb\.oclc\.org|https?://webggc\.oclc\.org/cbs/.*(DB=2\.37)?.*$)'
    )
    return bool(catalogue_regex.match(url))
def is_mmdc_url(url):
    """Check whether the URL belongs to the MMDC website.

    Accepted are URLs starting with http(s)://(www.)mmdc.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    mmdc_regex = r'https?://(www\.)?mmdc\.nl/?'
    return bool(re.match(mmdc_regex, url))
def is_litges_url(url):
    """Check whether the URL belongs to literatuurgeschiedenis.nl or .org.

    Accepted are URLs starting with
    http(s)://(www.)literatuurgeschiedenis.nl or .org.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    litges_regex = re.compile(r'https?://(www\.)?literatuurgeschiedenis\.(nl|org)/?')
    found = litges_regex.match(url)
    return found is not None
def is_jsru_url(url):
    """Check whether the URL belongs to the KB (j)SRU service.

    Accepted are URLs starting with http(s)://jsru.kb.nl or
    http(s)://sru.kb.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    sru_regex = r'https?://j?sru\.kb\.nl/?'
    return bool(re.match(sru_regex, url))
def is_authkb_url(url):
    """Check whether the URL goes through the KB authentication service.

    Accepted are URLs whose host ends in '.authkb.kb.nl' and is followed by
    '/', for example 'https://doi-org.access.authkb.kb.nl/'.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The part before '.authkb.kb.nl' may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that only mention
    # '.authkb.kb.nl/' somewhere in their path or query string.
    pattern = r'https?://[^/?#]*\.authkb\.kb\.nl/'
    return re.match(pattern, url) is not None
# Old KB services (legacy)
def is_bibliopolis_url(url):
    """Check whether the URL belongs to Bibliopolis.

    Accepted are URLs starting with http(s)://(www.)bibliopolis.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    bibliopolis_regex = r'https?://(www\.)?bibliopolis\.nl/?'
    return bool(re.match(bibliopolis_regex, url))
def is_dbng_url(url):
    """Check whether the URL belongs to DBNG (Digitale Bibliografie Nederlandse Geschiedenis).

    Accepted are URLs starting with http(s)://(www.)dbng.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    dbng_regex = re.compile(r'https?://(www\.)?dbng\.nl/?')
    return bool(dbng_regex.match(url))
def is_mim_url(url):
    """Check whether the URL belongs to Middeleeuwse Verluchte Handschriften (MIM).

    Accepted are URLs starting with http(s)://manuscripts.kb.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    mim_regex = r'https?://manuscripts\.kb\.nl/?'
    found = re.match(mim_regex, url)
    return found is not None
def is_wilc_url(url):
    """Check whether the URL belongs to Watermarks in Incunabula printed in the Low Countries (WILC).

    Accepted are URLs starting with http(s)://watermark.kb.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    wilc_regex = r'https?://watermark\.kb\.nl/?'
    return bool(re.match(wilc_regex, url))
def is_poortman_url(url):
    """Check whether the URL belongs to Wijsbegeerte in Nederland (Poortmans repertorium).

    Accepted are URLs starting with http(s)://poortman.kb.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    poortman_regex = re.compile(r'https?://poortman\.kb\.nl/?')
    return bool(poortman_regex.match(url))
def is_ibl_url(url):
    """Check whether the URL belongs to Artikelen uit tijdschriften (IBL).

    Accepted are URLs starting with http(s)://ibl.kb.nl.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    ibl_regex = r'https?://ibl\.kb\.nl/?'
    found = re.match(ibl_regex, url)
    return found is not None
# BLOCK 2: For OB services #########################
# Main public OB sites
def is_biebnl_url(url):
    """Check whether the URL belongs to Bibliotheek.nl.

    Accepted are URLs starting with http(s)://bibliotheek.nl or
    http(s)://<subdomain>.bibliotheek.nl, for example
      * http://leesplein.bibliotheek.nl/assets/.....
      * http://stichting.bibliotheek.nl/
      * http://www.bibliotheek.nl/luisterboeken

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.bibliotheek.nl' in their path or query (e.g. archived copies).
    pattern = r'https?://([^/?#]*\.)?bibliotheek\.nl/?'
    return re.match(pattern, url) is not None
def is_onlinebieb_url(url):
    """Check whether the URL belongs to the Online bibliotheek.

    Accepted are URLs starting with http(s)://onlinebibliotheek.nl or
    http(s)://<subdomain>.onlinebibliotheek.nl, for example
      * https://www.onlinebibliotheek.nl/e-books.html

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.onlinebibliotheek.nl' in their path or query.
    pattern = r'https?://([^/?#]*\.)?onlinebibliotheek\.nl/?'
    return re.match(pattern, url) is not None
def is_jeugdbieb_url(url):
    """Check whether the URL belongs to the Jeugdbibliotheek.

    Accepted are URLs starting with http(s)://jeugdbibliotheek.nl or
    http(s)://<subdomain>.jeugdbibliotheek.nl, for example
      * https://www.jeugdbibliotheek.nl/catalogus.catalogus.html?q=Coby%20Leeuwenburgh-Kloosterman
      * https://12-15.jeugdbibliotheek.nl/lezen/informatie-over-schrijvers/gideon-samson.html

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.jeugdbibliotheek.nl' in their path or query.
    pattern = r'https?://([^/?#]*\.)?jeugdbibliotheek\.nl/?'
    return re.match(pattern, url) is not None
# Pro OB sites
def is_biebnetwerk_url(url):
    """Check whether the URL belongs to bibliotheeknetwerk.nl.

    Accepted are URLs starting with http(s)://bibliotheeknetwerk.nl or
    http(s)://<subdomain>.bibliotheeknetwerk.nl, for example
      * https://www.bibliotheeknetwerk.nl/artikel/bijzondere-bibliotheken

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.bibliotheeknetwerk.nl' in their path or query.
    pattern = r'https?://([^/?#]*\.)?bibliotheeknetwerk\.nl/?'
    return re.match(pattern, url) is not None
# Defunct OB sites, offline now, 404s
def is_digeta_url(url):
    """Check whether the URL belongs to Digitale etalages.

    Accepted are URLs starting with http(s)://digitaleetalages.nl or
    http(s)://<subdomain>.digitaleetalages.nl, for example
      * http://www.digitaleetalages.nl/dam/sinterklaas/zwarte-piet/.....
      * https://www.digitaleetalages.nl/thema/amsterdam/175-jaar-artis/artis-en-de-functieverandering-van-dierentuinen.html

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.digitaleetalages.nl' in their path or query.
    pattern = r'https?://([^/?#]*\.)?digitaleetalages\.nl/?'
    return re.match(pattern, url) is not None
def is_litplein_url(url):
    """Check whether the URL belongs to Literatuurplein.

    Accepted are URLs starting with http(s)://literatuurplein.nl or
    http(s)://<subdomain>.literatuurplein.nl, for example
      * http://www.literatuurplein.nl/persdetail.jsp?persId=644013
      * http://literatuurplein.nl/

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.literatuurplein.nl' in their path or query.
    pattern = r'https?://([^/?#]*\.)?literatuurplein\.nl/?'
    return re.match(pattern, url) is not None
def is_leesplein_url(url):
    """Check whether the URL belongs to Leesplein.

    Accepted are URLs starting with http(s)://leesplein.nl or
    http(s)://<subdomain>.leesplein.nl, for example
      * https://www.leesplein.nl/LL_plein.php?submenu=set_set&id=135
      * http://www.leesplein.nl/LL_plein.php?submenu=set_set&id=7681

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it
    # stays inside the host; '.*' would also accept URLs that merely embed
    # '.leesplein.nl' in their path or query.
    pattern = r'https?://([^/?#]*\.)?leesplein\.nl/?'
    return re.match(pattern, url) is not None
========= servicesnames_dict.json =============
{
"kb_services": { "kbsite": { "kbsite": "KB.nl", "kbmain": "KB website", "kbcollections": "KB Collecties website" }, "delpher": { "delpher": "Delpher", "delpher_static": "Delpher statische paginas", "delpher_newspapers_basic": "Kranten 1618-1995 (Basiscollectie)", "delpher_newspapers_external": "Externe regionale kranten", "delpher_books_basic": "Boeken 17e t/m 20e eeuw (Basiscollectie)", "delpher_books_google": "Boeken Google", "delpher_magazines": "Tijdschriften 19e en 20e eeuw", "delpher_radiobulletins": "Radiobulletins van het ANP" }, "dbnl": { "dbnl": "DBNL" }, "databibnl": { "databibnl": "data.bibliotheken.nl" }, "small": { "small": "Kleine KB-diensten", "litges": "Literatuurgeschiedenis", "kbcat": "KB catalogus", "gvn": "Geheugen (van Nederland)", "mmdc": "MMDC", "jsru": "(j)SRU service", "authkb": "KB authentication service" }, "legacy": { "legacy": "Oude KB-diensten (legacy)", "bibliopolis": "Bibliopolis", "dbng": "Digitale Bibliografie Nederlandse Geschiedenis", "mim": "Middeleeuwse Verluchte Handschriften (MIM)", "wilc": "Watermarks in Incunabula printed in the Low Countries (WILC)", "poortman": "Wijsbegeerte in Nederland (Poortmans repertorium)", "ibl": "Artikelen uit tijdschriften (IBL)" } }, "ob_services": { "main": { "main": "Hoofddiensten OB", "biebnl": "Bibliotheek.nl", "onlinebieb": "Online bibliotheek", "jeugdbieb": "Jeugdbibliotheek" }, "pro": { "pro": "Diensten voor OB-professionals", "biebnetwerk": "Bnetwerk" }, "defunct": { "defunct": "Voormalige OB-diensten", "leesplein": "Leesplein", "litplein": "Literatuurplein", "digeta": "Digitale etalages" } }
}