Naar inhoud springen

Gebruiker:OlafJanssen/KBserviceMappings

Uit Wikipedia, de vrije encyclopedie

"""
Latest update: 21 December 2023
Author: Olaf Janssen, Wikimedia coordinator @KB, national library of the Netherlands
"""

import json import os import re import overall

# Retrieve the services dictionary and its sub-dictionaries --> @VERA: see the very bottom of this page for this JSON

# Load the service-name mappings. The path is relative to the current working
# directory. The encoding is given explicitly so the result does not depend
# on the platform's default encoding.
with open('servicesnames_dict.json', 'r', encoding='utf-8') as file:
    data = file.read()

# One statement per line: the three assignments below were fused on a single
# line, which is not valid Python.
servicesnames_dict = json.loads(data)
# Both lookups use dict.get, so each name is None when its key is missing.
kb_services = servicesnames_dict.get('kb_services')
ob_services = servicesnames_dict.get('ob_services')

# Regex fragment (already escaped) for the start of a KB resolver URL; the
# is_*_url functions append a resolver identifier to it.
resolverprefix = "https?:\\/\\/resolver\\.kb\\.nl/resolve\\?urn="

# `https?://` in the prefix above matches 'http://' or 'https://'.

"""

Check if all keys in "servicesnames_dict.json" are used in the function/def names in Blocks 1 and 2 below, and v.v.
  Example: servicesnames_dict.json has a key named 'bibliopolis' , which is include in 
  the function name (in Block 1) 'def is_bibliopolis_url(url)' --> this is OK!

""" def check_functionnames_against_dictkeys(dict):

   """
   Checks if parts of function names in the current script match with third-order keys in a provided dictionary.
   This function finds all function names in the current script that follow a specific naming pattern (e.g., is_X_url).
   It then checks if the 'X' part of these function names exists as a third-order key in the provided dictionary.
   Parameters:
   - dict (dict): A nested dictionary to search for third-order keys.
   Output:
   - The function prints messages indicating whether the part of each function name is found as a third-order key
     in the dictionary.
   """
   filepath = os.path.abspath(__file__)  # Path of this .py file
   function_names = overall.get_function_names(filepath)  # Get function names from the current file
   defsyntax = r'^is_(.*?)_url$'  # The pattern you want to match, ie "is_XXXXX_url"
   keys = overall.find_order_keys(dict,3)  # Retrieve third-order keys from the dictionary
   for name in function_names:
       if overall.matches_pattern(name, defsyntax):  # Only function names of syntax "is_XXXXX_url"
           function_name_id = name.split("_")[1]  # Extract 'XXXXX' from 'is_XXXXX_url'
           if function_name_id in keys:
               print(f"OK OK OK OK !! The function name id '{function_name_id}' in the function name '{name}' is reflected as "
                     f"a 3rd order key in the provided dictionary.")
           else:
               print(f"Warning warning warning!! The function name id '{function_name_id}' in the function name '{name}' "
                     f"is not reflected as a 3rd order key in the provided dictionary.")
       else:
           print(f"Warning warning warning!! Function name '{name}' does not follow the required syntax '{defsyntax}'.")
# Only execute 'check_functionnames_against_dictkeys()' if this .py module is run directly.
# If this module is imported (via an import statement), 'check_functionnames_against_dictkeys()' is not executed.

# Consistency self-check. At this point the is_*_url functions further down
# are not yet defined; the check is handed the path of this file and obtains
# the names via overall.get_function_names(filepath) -- presumably by reading
# the source text rather than the live namespace (TODO confirm).
if __name__ == "__main__":

   check_functionnames_against_dictkeys(servicesnames_dict)

def merge_dicts(kb_dict, ob_dict):
    """
    Merge the per-category service dictionaries of KB and OB into one flat dict.

    Args:
        kb_dict (dict): Dictionary containing various KB service categories.
        ob_dict (dict): Dictionary containing various OB service categories.

    Returns:
        dict: A single dictionary containing all merged services
        (service key -> service name).

    Note:
        If the same key is present in multiple dictionaries, the value from
        the last dictionary wins; OB categories are merged after KB ones.
    """
    merged_services = {}
    # Derive the category keys from the arguments themselves. Earlier this read
    # the module-level kb_services / ob_services, so the function silently
    # ignored the structure of whatever dictionaries the caller passed in.
    # Per the author's original note the helper yields category names such as
    # ['kbsite', 'delpher', ...] and ['main', 'pro', 'defunct'].
    kb_keys = overall.find_order_keys(kb_dict, 2)
    ob_keys = overall.find_order_keys(ob_dict, 2)
    # Merge KB services; a missing category contributes nothing.
    for key in kb_keys:
        merged_services.update(kb_dict.get(key, {}))
    # Merge OB services
    for key in ob_keys:
        merged_services.update(ob_dict.get(key, {}))
    return merged_services

def determine_service(url):
    """
    Determine the KB or OB service that a URL belongs to.

    All services from the module-level kb_services and ob_services are merged
    into one mapping (service key -> service name). For every service key the
    globally defined function 'is_<service_key>_url' is looked up and, when it
    exists, called with the URL. The name of the first service whose function
    returns a truthy value is returned.

    Args:
        url (str): The URL to be checked against KB and OB service patterns.

    Returns:
        str: The name of the matched service; an empty string when nothing
        matches; the message 'Invalid or empty URL provided' when url is not
        a non-empty string.

    Note:
        Service keys without a corresponding 'is_<service_key>_url' function
        are skipped.
    """
    if not isinstance(url, str) or not url:
        return 'Invalid or empty URL provided'
    # Combine all service categories into one dict for streamlined processing
    all_services = merge_dicts(kb_services, ob_services)
    # Keys that only label a category; they are skipped explicitly.
    # TODO: Check if this explicit skip is still necessary
    category_only_keys = ('kbsite', 'delpher', 'legacy', 'small')
    for service_key, service_name in all_services.items():
        if service_key in category_only_keys:
            continue
        check_function = globals().get(f'is_{service_key}_url')
        # Call the function with the URL (if such a function exists)
        if check_function and check_function(url):
            return service_name
    # Documented contract: empty string (not None) when no service matches
    return ''

def determine_category(service):
    """
    Map a service name to the name of the category it is listed under.

    The category dictionaries of the module-level kb_services and ob_services
    are combined (OB entries override KB entries with the same category key).
    The first category whose values contain the given service name determines
    the result: the value stored in that category under the category's own key.

    Args:
        service (str): The service name to be categorized.

    Returns:
        str: The category name if the service is found in the mappings
        (None if the category has no entry under its own key);
        otherwise the original service name.
    """
    categories = dict(kb_services)
    categories.update(ob_services)
    for category_key, category_services in categories.items():
        if service not in category_services.values():
            continue
        # The category's display name is stored under the category key itself
        return category_services.get(category_key)
    return service
# Long list of individual URL pattern matching functions
# Function (def) names should always follow the syntax "is_XXXXX_url(url)"
# BLOCK 1: For 'classical' KB services #########################
# KB.nl

def is_kbmain_url(url):
    """
    Check whether the URL belongs to the KB main website (www.kb.nl).

    Accepted are URLs starting with
    * http(s)://(www.)kb.nl/
    * http(s)://galerij.kb.nl/
    * http(s)://blog.kb.nl/
    (the slash after the host name is required).

    Rejected are URLs starting with
    * http(s)://(kranten|boeken|boeken1|tijdschriften|anp|poortman|resolver|opc4|jsru|sru|watermark|bltvn|ibl|collecties|manuscripts).kb.nl
    * http(s)://XXXXX.authkb.kb.nl/
    * http(s)://(www.)kb.nl/bc/koopman
    * http(s)://(www.)kb.nl/themas
    * http(s)://(www.)kb.nl/en/themes

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL is a KB main website URL, False otherwise.
    """
    exclusions = (
        # Collection pages on (www.)kb.nl
        r'https?://(www\.)?kb\.nl/(bc/koopman|themas|en/themes)',
        # Other kb.nl subdomains; 'bltvn' and 'ibl' were listed in the
        # documentation but missing from the pattern.
        r'https?://(kranten|boeken1?|tijdschriften|anp|poortman|resolver|opc4|jsru|sru|watermark|bltvn|ibl|collecties|manuscripts)\.kb\.nl',
        # Any host ending in '.authkb.kb.nl'. The wildcard is limited to the
        # host part: the former '.*' also ran through path and query, so a
        # genuine kb.nl URL that merely contained '.authkb.kb.nl/' further on
        # was rejected.
        r'https?://[^/?#]+\.authkb\.kb\.nl/',
    )
    main_site = r'https?://(www\.|galerij\.|blog\.)?kb\.nl/'
    if any(re.match(pattern, url) for pattern in exclusions):
        return False
    return re.match(main_site, url) is not None

def is_kbcollections_url(url):
    """
    Check whether the URL belongs to the KB collections website (collecties.kb.nl).

    Accepted are URLs starting with
    * http(s)://collecties.kb.nl/
    * http(s)://(www.)kb.nl/themas/
    * http(s)://(www.)kb.nl/en/themes/
    * http(s)://(www.)kb.nl/bc/koopman
    There are no exclusions.

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL starts with one of these prefixes, False otherwise.
    """
    # Either a collection path on (www.)kb.nl or the collecties.kb.nl host
    collections_regex = re.compile(
        r'https?://((www\.)?kb\.nl/(bc/koopman|themas/|en/themes/)|collecties\.kb\.nl/)'
    )
    if collections_regex.match(url):
        return True
    return False
# Delpher

def is_delpher_static_url(url):
    """
    Check whether the URL is one of the static Delpher pages.

    Accepted are
    * exactly http(s)://(www.)delpher.nl, optionally followed by '/nl' and/or
      a single trailing slash (the home page);
    * URLs starting with http(s)://(www.)delpher.nl/(nl/) followed by
      'platform', 'over-delpher' or 'thema'.
    There are no exclusions.

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Home page only: the '$' forbids anything after the optional slash
    homepage = r'^https?://(www\.)?delpher\.nl(/nl)?/?$'
    # The three static sections, with or without the 'nl/' path segment
    static_sections = r'^https?://(www\.)?delpher\.nl/(nl/)?(platform|over-delpher|thema)'
    return any(re.match(candidate, url) for candidate in (homepage, static_sections))


def is_delpher_newspapers_basic_url(url):
    """
    Check whether the URL belongs to 'Delpher Kranten 1618-1995 (Basiscollectie)'.

    Accepted are URLs starting with
    * Static patterns
      - http(s)://kranten.(kb|delpher).nl(/)
      - http(s)://(www.)delpher.nl/kranten(/)
      - http(s)://(www.)delpher.nl/nl/kranten(/)
    * Resolver newspaper patterns
      - http(s)://resolver.kb.nl/resolve?urn=<ID>, where <ID> is one of the
        newspaper resolver identifiers listed below.
    (Identifiers taken and modified from https://jsru.kb.nl/sru/sru?query=*&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=0&recordSchema=ddd&x-collection=DDD_krantnr&x-facets=facets:mdoSet)

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after 'kranten' is escaped; unescaped it stood
    # for "any character" and accepted hosts such as 'krantenXkb.nl'.
    static_patterns = (
        r'^https?://kranten\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?kranten/?',
    )
    newspaper_resolver_ids = (
        'ddd', 'ABCDDD', 'KBDDD02', 'KBNRC01', 'KBPERS01', 'MMCC01', 'MMCC02', 'MMCODA01', 'MMCODA02', 'MMDA03',
        'MMECAL02', 'MMGARO01', 'MMGASL01', 'MMGAVL01', 'MMGAZS01', 'MMGEM01', 'MMHCO01', 'MMHCO02',
        'MMIISG05', 'MMIISG07', 'MMIISG18', 'MMKB04', 'MMKB08', 'MMKB12', 'MMKB15', 'MMKB19', 'MMKB23', 'MMKB27',
        'MMKB32', 'MMMAAS01', 'MMMHW01', 'MMNHA02', 'MMNHA03', 'MMNIOD04', 'MMNIOD05', 'MMPM05', 'MMRANM02',
        'MMRANM03', 'MMRANMG01', 'MMRAZ02', 'MMRAZ03', 'MMRHCE01', 'MMRHCE02', 'MMRHCG03', 'MMSAA06', 'MMSAB03',
        'MMSAB04', 'MMSADB01', 'MMSAEN01', 'MMSAEN02', 'MMSAK01', 'MMSARO02', 'MMSAVL02', 'MMSHCL03', 'MMTELE01',
        'MMTRES02', 'MMTRES03', 'MMTRES04', 'MMUBTB04', 'MMUBWA01', 'MMVEEN01', 'MMVEEN02', 'MMWFA01',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True
    # Resolver URLs: module-level prefix immediately followed by an identifier
    return any(
        re.match(rf'{resolverprefix}{resolver_id}', url)
        for resolver_id in newspaper_resolver_ids
    )

def is_delpher_newspapers_external_url(url):
    """
    Check whether the URL belongs to 'Externe regionale kranten'.

    Accepted are URLs starting with http(s)://(www.)delpher.nl/(nl/)regio

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    regio_regex = re.compile(r'^https?://(www\.)?delpher\.nl/(nl/)?regio/?')
    found = regio_regex.match(url)
    return found is not None


def is_delpher_books_basic_url(url):
    """
    Check whether the URL belongs to 'Boeken 17e t/m 20e eeuw (Basiscollectie)'.

    Accepted are URLs starting with
    * Static patterns
      - http(s)://boeken.(kb|delpher).nl(/)
      - http(s)://(www.)delpher.nl/boeken/ --> compulsory trailing '/' to prevent matching with 'boeken1'
      - http(s)://(www.)delpher.nl/nl/boeken/ --> compulsory trailing '/' to prevent matching with 'boeken1'
    * Resolver book patterns
      - http(s)://resolver.kb.nl/resolve?urn=<ID>, where <ID> is one of the
        book resolver identifiers listed below.
    (Identifiers taken and modified from https://jsru.kb.nl/sru/sru?query=*&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=0&recordSchema=ddd&x-collection=BOEKEN_boek&x-facets=facets:mdoSet)

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after 'boeken' is escaped; unescaped it stood
    # for "any character" instead of a literal dot.
    static_patterns = (
        r'^https?://boeken\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?boeken/',
    )
    book_resolver_ids = (
        'dpo', 'DEJONG', 'GBR02', 'KBEU7475', 'KBITA01', 'KBTRES01', 'KONB10', 'KONB15', 'MMATR03', 'MMATR08',
        'MMCMC01', 'MMKB02', 'MMKB02A', 'MMKB02B', 'MMKB05', 'MMKB06', 'MMKB11', 'MMKB18', 'MMKB18A', 'MMKB18B',
        'MMKB18C', 'MMKB18D', 'MMKB21', 'MMKB22', 'MMKB24', 'MMKB25', 'MMKB28', 'MMKB31', 'MMKIT03', 'MMMVC01',
        'MMNIOD07', 'MMSFKB02', 'MMSFUBA02', 'MMSFUBU02', 'MMTSGG01', 'MMTUA01', 'MMTUK01', 'MMUBA08', 'MMUBA09',
        'MMUBL07', 'MMUBVU02', 'MMUBVU05', 'NIOD02', 'NIOD05', 'NOM01', 'PRB01', 'SAB01',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True
    # Resolver URLs: module-level prefix immediately followed by an identifier
    return any(
        re.match(rf'{resolverprefix}{resolver_id}', url)
        for resolver_id in book_resolver_ids
    )


def is_delpher_books_google_url(url):
    """
    Check whether the URL belongs to 'Boeken Google'.

    Accepted are URLs starting with
    - http(s)://boeken1.(kb|delpher).nl(/)
    - http(s)://(www.)delpher.nl/boeken1(/)
    - http(s)://(www.)delpher.nl/nl/boeken1(/)

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    patterns = (
        # The dot after 'boeken1' is escaped; unescaped it matched any
        # character, so e.g. 'boeken1Xkb.nl' was accepted.
        r'^https?://boeken1\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?boeken1/?',
    )
    return any(re.match(pattern, url) for pattern in patterns)

def is_delpher_magazines_url(url):
    """
    Check whether the URL belongs to 'Tijdschriften 19e en 20e eeuw'.

    Accepted are URLs starting with
    * Static patterns
      - http(s)://tijdschriften.(kb|delpher).nl(/)
      - http(s)://(www.)delpher.nl/tijdschriften(/)
      - http(s)://(www.)delpher.nl/nl/tijdschriften(/)
    * Resolver magazine patterns
      - http(s)://resolver.kb.nl/resolve?urn=<ID>, where <ID> is one of the
        magazine resolver identifiers listed below.
    (Identifiers taken and modified from https://jsru.kb.nl/sru/sru?query=*&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=0&recordSchema=ddd&x-collection=DTS_document&x-facets=facets:mdoSet)

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Static patterns. The dot after 'tijdschriften' is escaped; unescaped it
    # stood for "any character" instead of a literal dot.
    static_patterns = (
        r'^https?://tijdschriften\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?tijdschriften/?',
    )
    magazine_resolver_ids = (
        'dts', 'CBS_EXT', 'GMDH01', 'KBDBNL02', 'KBDBVL02', 'KBDC001', 'KBKWG02', 'KBNA001', 'KBVRNL01', 'MMAD01',
        'MMALET01', 'MMALET02', 'MMATR01', 'MMATR02', 'MMATR05', 'MMATR06', 'MMATR07', 'MMATR09', 'MMAVB01', 'MMBOY01',
        'MMBPH01', 'MMBZK02', 'MMCBS01', 'MMCBS02', 'MMCBS03', 'MMCMC03', 'MMCMC05', 'MMCSS01', 'MMDA04', 'MMEGG01', 'MMENLK02',
        'MMENLK03', 'MMENLK04', 'MMEUR03', 'MMEYE01', 'MMEYE02', 'MMGAMS01', 'MMGAMS02', 'MMGAZS02', 'MMHCC01', 'MMHFC01',
        'MMHNI01', 'MMHNI03', 'MMIISG02', 'MMIISG06', 'MMIISG08', 'MMIISG10', 'MMIISG13', 'MMIISG15', 'MMIISG17', 'MMIISG20', 'MMIISG21',
        'MMKB07', 'MMKB10', 'MMKB13', 'MMKB14', 'MMKB16', 'MMKB26', 'MMKB30', 'MMKB37', 'MMKDC03', 'MMKDC04', 'MMKDC05', 'MMKDC07', 'MMKDC09',
        'MMKDC10', 'MMKIT04', 'MMKITLV3', 'MMKNAU01', 'MMKNGU01', 'MMKNMP01', 'MMKNMP02', 'MMKNRB01', 'MMKNTB01', 'MMKNVB01', 'MMKPSV01',
        'MMKVLO01', 'MMKVLO02', 'MMKWG01', 'MMLIND01', 'MMMI02', 'MMNA12', 'MMNAT03', 'MMNDR01', 'MMNFM01', 'MMNHV01', 'MMNIBG01', 'MMNIOD08',
        'MMNMM02', 'MMNPZ01', 'MMNVOG01', 'MMNVOG02', 'MMOBDH01', 'MMOHKT01', 'MMPM02', 'MMPM03', 'MMPM04', 'MMRAA02', 'MMRHCG01', 'MMRHCG02',
        'MMRKD09', 'MMRMO02', 'MMSARO01', 'MMSAVL01', 'MMSHCL02', 'MMSPOM01', 'MMTEY01', 'MMTUK02', 'MMTUK03', 'MMUBA10', 'MMUBA13', 'MMUBA15',
        'MMUBA16', 'MMUBL08', 'MMUBL09', 'MMUBMA01', 'MMUBMA02', 'MMUBTB01', 'MMUBTB02', 'MMUBTB03', 'MMUBU02', 'MMUBVU04', 'MMUBVU06', 'MMUBVU07',
        'MMUBVU08', 'MMUBWA02', 'MMUBWA03', 'MMUBWA04', 'MMUBWA05', 'MMUTRA01', 'MMUTRA03', 'MMVRED01', 'MMZAH01', 'MMZAH03', 'MMZB04', 'MMZEND01',
        'MMZEND02', 'MMZOU01', 'NIOD07', 'NIOD09', 'OBA01',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True
    # Resolver URLs: module-level prefix immediately followed by an identifier
    return any(
        re.match(rf'{resolverprefix}{resolver_id}', url)
        for resolver_id in magazine_resolver_ids
    )


def is_delpher_radiobulletins_url(url):
    """
    Check whether the URL belongs to 'Radiobulletins van het ANP'.

    Accepted are URLs starting with
    * Static patterns
      - http(s)://(anp|radiobulletins).(kb|delpher).nl(/)
      - http(s)://(www.)delpher.nl/radiobulletins(/)
      - http(s)://(www.)delpher.nl/nl/radiobulletins(/)
    * Resolver radiobulletins pattern
      - http(s)://resolver.kb.nl/resolve?urn=anp:

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    static_patterns = (
        # The dot after the subdomain is escaped; unescaped it matched any
        # character instead of a literal dot.
        r'^https?://(anp|radiobulletins)\.(kb|delpher)\.nl/?',
        r'^https?://(www\.)?delpher\.nl/(nl/)?radiobulletins/?',
    )
    if any(re.match(pattern, url) for pattern in static_patterns):
        return True
    # Resolver URLs: module-level prefix followed by the 'anp:' namespace
    return re.match(rf'^{resolverprefix}anp:', url) is not None


def is_databibnl_url(url):
    """
    Check whether the URL belongs to data.bibliotheken.nl.

    Accepted are URLs starting with http(s)://data.bibliotheken.nl(/)

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # re.match only matches at the beginning of the string
    if re.match(r'https?://data\.bibliotheken\.nl/?', url) is None:
        return False
    return True
# Other stand-alone KB services (overige eigenstandige KB-diensten)

def is_dbnl_url(url):
    """
    Check whether the URL belongs to DBNL.

    Accepted are URLs starting with
    * http(s)://dbnl.nl or http(s)://dbnl.org
    * http(s)://<subdomain>.dbnl.nl or .org (non-empty subdomain part)
    * http(s)://resolver.kb.nl/resolve?urn=dbnl:

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # The optional subdomain part may not contain '/', '?' or '#', so it stays
    # inside the host name. The former '(.*\.)?' could run through the path,
    # which classified e.g. archive URLs that merely embed a DBNL address
    # as DBNL.
    host_pattern = r'https?://([^/?#]+\.)?dbnl\.(nl|org)'
    if re.match(host_pattern, url):
        return True
    # Resolver URLs: module-level prefix followed by the 'dbnl:' namespace
    return re.match(rf'{resolverprefix}dbnl:', url) is not None

def is_gvn_url(url):
    """
    Check whether the URL belongs to Geheugen (van Nederland) (GvN).

    Accepted are URLs starting with
    * http(s)://geheugen.delpher.nl
    * http(s)://(www.)geheugenvannederland.nl
    * http(s)://resolver.kb.nl/resolve?urn=urn:gvn:

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # One anchored alternation of three URL prefixes:
    #   1. the geheugen.delpher.nl host,
    #   2. the geheugenvannederland.nl host with an optional 'www.',
    #   3. the module-level resolver prefix followed by 'urn:gvn:'.
    # The 'rf' prefix makes this a raw f-string: backslashes stay literal and
    # {resolverprefix} is substituted.
    gvn_regex = rf'^(https?://geheugen\.delpher\.nl|https?://(www\.)?geheugenvannederland\.nl|{resolverprefix}urn:gvn:)'
    hit = re.match(gvn_regex, url)
    return hit is not None

def is_kbcat_url(url):
    """
    Check whether the URL belongs to the KB catalogue.

    Accepted are URLs starting with
    * http(s)://opc4.kb.nl
    * http(s)://resolver.kb.nl/resolve?urn=PPN:
    * http(s)://opc-kb.oclc.org
    * http(s)://webggc.oclc.org/cbs/ (the 'DB=2.37' part of the pattern is
      optional, so it does not restrict the match)

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches one of the patterns, False otherwise.
    """
    # Four alternatives separated by '|', anchored at the start of the URL by
    # '^'. Dots in host names are escaped because '.' is a regex wildcard;
    # {resolverprefix} is the module-level resolver URL prefix.
    catalogue_regex = rf'^(https?://opc4\.kb\.nl|{resolverprefix}PPN:|https?://opc-kb\.oclc\.org|https?://webggc\.oclc\.org/cbs/.*(DB=2\.37)?.*$)'
    return bool(re.match(catalogue_regex, url))

def is_mmdc_url(url):
    """
    Check whether the URL starts with http(s)://(www.)mmdc.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    return bool(re.match(r'https?://(www\.)?mmdc\.nl/?', url))

def is_litges_url(url):
    """
    Check whether the URL starts with http(s)://(www.)literatuurgeschiedenis.nl or .org.

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    litges_regex = re.compile(r'https?://(www\.)?literatuurgeschiedenis\.(nl|org)/?')
    return litges_regex.match(url) is not None

def is_jsru_url(url):
    """
    Check whether the URL starts with http(s)://jsru.kb.nl or http(s)://sru.kb.nl.

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # 'j?' makes the leading 'j' optional, covering both host names
    if re.match(r'https?://j?sru\.kb\.nl/?', url):
        return True
    return False

def is_authkb_url(url):
    """
    Check whether the URL points to a host ending in '.authkb.kb.nl'.

    Accepted are URLs starting with http(s)://<something>.authkb.kb.nl/,
    for example 'https://doi-org.access.authkb.kb.nl/'.

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # The wildcard before '.authkb' excludes '/', '?' and '#', so it cannot
    # leave the host name. The former '.*' also matched URLs on any other
    # host that merely contained '.authkb.kb.nl/' in their path or query.
    pattern = r'https?://[^/?#]+\.authkb\.kb\.nl/'
    return re.match(pattern, url) is not None
# Old KB services (legacy)

def is_bibliopolis_url(url):
    """
    Check whether the URL starts with http(s)://(www.)bibliopolis.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    bibliopolis_regex = re.compile(r'https?://(www\.)?bibliopolis\.nl/?')
    return bibliopolis_regex.match(url) is not None

def is_dbng_url(url):
    """
    Check whether the URL starts with http(s)://(www.)dbng.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    return bool(re.match(r'https?://(www\.)?dbng\.nl/?', url))

def is_mim_url(url):
    """
    Check whether the URL starts with http(s)://manuscripts.kb.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    if re.match(r'https?://manuscripts\.kb\.nl/?', url) is None:
        return False
    return True

def is_wilc_url(url):
    """
    Check whether the URL starts with http(s)://watermark.kb.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    watermark_match = re.match(r'https?://watermark\.kb\.nl/?', url)
    return watermark_match is not None

def is_poortman_url(url):
    """
    Check whether the URL starts with http(s)://poortman.kb.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    poortman_regex = re.compile(r'https?://poortman\.kb\.nl/?')
    if poortman_regex.match(url):
        return True
    return False

def is_ibl_url(url):
    """
    Check whether the URL starts with http(s)://ibl.kb.nl(/).

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    return bool(re.compile(r'https?://ibl\.kb\.nl/?').match(url))
# BLOCK 2: For OB services #########################
# Main public OB sites

def is_biebnl_url(url):
    """
    Check whether the URL is on bibliotheek.nl or one of its subdomains.

    Examples of accepted URLs:
    * http://leesplein.bibliotheek.nl/assets/.....
    * http://stichting.bibliotheek.nl/
    * http://www.bibliotheek.nl/luisterboeken

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # The optional subdomain part excludes '/', '?' and '#', so it stays in the
    # host name. The former '(.*\.)?' could run through the path and matched
    # e.g. archive URLs that only embed a bibliotheek.nl address.
    pattern = r'https?://([^/?#]+\.)?bibliotheek\.nl/?'
    return re.match(pattern, url) is not None

def is_onlinebieb_url(url):
    """
    Check whether the URL is on onlinebibliotheek.nl or one of its subdomains.

    Example of an accepted URL:
    * https://www.onlinebibliotheek.nl/e-books.html

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # Subdomain wildcard limited to the host name (no '/', '?' or '#'); the
    # former '(.*\.)?' also matched the domain when it appeared in the path
    # of a URL on another host.
    pattern = r'https?://([^/?#]+\.)?onlinebibliotheek\.nl/?'
    return re.match(pattern, url) is not None

def is_jeugdbieb_url(url):
    """
    Check whether the URL is on jeugdbibliotheek.nl or one of its subdomains.

    Examples of accepted URLs:
    * https://www.jeugdbibliotheek.nl/catalogus.catalogus.html?q=Coby%20Leeuwenburgh-Kloosterman
    * https://12-15.jeugdbibliotheek.nl/lezen/informatie-over-schrijvers/gideon-samson.html

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # Subdomain wildcard limited to the host name (no '/', '?' or '#'); the
    # former '(.*\.)?' also matched the domain when it appeared in the path
    # of a URL on another host.
    pattern = r'https?://([^/?#]+\.)?jeugdbibliotheek\.nl/?'
    return re.match(pattern, url) is not None
# Pro OB sites

def is_biebnetwerk_url(url):
    """
    Check whether the URL is on bibliotheeknetwerk.nl or one of its subdomains.

    Example of an accepted URL:
    * https://www.bibliotheeknetwerk.nl/artikel/bijzondere-bibliotheken

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # Subdomain wildcard limited to the host name (no '/', '?' or '#'); the
    # former '(.*\.)?' also matched the domain when it appeared in the path
    # of a URL on another host.
    pattern = r'https?://([^/?#]+\.)?bibliotheeknetwerk\.nl/?'
    return re.match(pattern, url) is not None
# Defunct OB sites, offline now, 404s

def is_digeta_url(url):
    """
    Check whether the URL is on digitaleetalages.nl or one of its subdomains.

    Examples of accepted URLs:
    * http://www.digitaleetalages.nl/dam/sinterklaas/zwarte-piet/.....
    * https://www.digitaleetalages.nl/thema/amsterdam/175-jaar-artis/artis-en-de-functieverandering-van-dierentuinen.html

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # Subdomain wildcard limited to the host name (no '/', '?' or '#'); the
    # former '(.*\.)?' also matched the domain when it appeared in the path
    # of a URL on another host.
    pattern = r'https?://([^/?#]+\.)?digitaleetalages\.nl/?'
    return re.match(pattern, url) is not None

def is_litplein_url(url):
    """
    Check whether the URL is on literatuurplein.nl or one of its subdomains.

    Examples of accepted URLs:
    * http://www.literatuurplein.nl/persdetail.jsp?persId=644013
    * http://literatuurplein.nl/

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # Subdomain wildcard limited to the host name (no '/', '?' or '#'); the
    # former '(.*\.)?' also matched the domain when it appeared in the path
    # of a URL on another host.
    pattern = r'https?://([^/?#]+\.)?literatuurplein\.nl/?'
    return re.match(pattern, url) is not None

def is_leesplein_url(url):
    """
    Check whether the URL is on leesplein.nl or one of its subdomains.

    Examples of accepted URLs:
    * https://www.leesplein.nl/LL_plein.php?submenu=set_set&id=135
    * http://www.leesplein.nl/LL_plein.php?submenu=set_set&id=7681

    Args:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL matches the pattern, False otherwise.
    """
    # Subdomain wildcard limited to the host name (no '/', '?' or '#'); the
    # former '(.*\.)?' also matched the domain when it appeared in the path
    # of a URL on another host.
    pattern = r'https?://([^/?#]+\.)?leesplein\.nl/?'
    return re.match(pattern, url) is not None


========= servicesnames_dict.json =============[bewerken | brontekst bewerken]

{

 "kb_services": {
   "kbsite": {
     "kbsite": "KB.nl",
     "kbmain": "KB website",
     "kbcollections": "KB Collecties website"
   },
   "delpher": {
     "delpher": "Delpher",
     "delpher_static": "Delpher statische paginas",
     "delpher_newspapers_basic": "Kranten 1618-1995 (Basiscollectie)",
     "delpher_newspapers_external": "Externe regionale kranten",
     "delpher_books_basic": "Boeken 17e t/m 20e eeuw (Basiscollectie)",
     "delpher_books_google": "Boeken Google",
     "delpher_magazines": "Tijdschriften 19e en 20e eeuw",
     "delpher_radiobulletins": "Radiobulletins van het ANP"
   },
   "dbnl": {
     "dbnl": "DBNL"
   },
   "databibnl": {
     "databibnl": "data.bibliotheken.nl"
   },
   "small": {
     "small": "Kleine KB-diensten",
     "litges": "Literatuurgeschiedenis",
     "kbcat": "KB catalogus",
     "gvn": "Geheugen (van Nederland)",
     "mmdc": "MMDC",
     "jsru": "(j)SRU service",
     "authkb": "KB authentication service"
   },
   "legacy": {
     "legacy": "Oude KB-diensten (legacy)",
     "bibliopolis": "Bibliopolis",
     "dbng": "Digitale Bibliografie Nederlandse Geschiedenis",
     "mim": "Middeleeuwse Verluchte Handschriften (MIM)",
     "wilc": "Watermarks in Incunabula printed in the Low Countries (WILC)",
     "poortman": "Wijsbegeerte in Nederland (Poortmans repertorium)",
     "ibl": "Artikelen uit tijdschriften (IBL)"
   }
 },
 "ob_services": {
   "main": {
     "main": "Hoofddiensten OB",
     "biebnl": "Bibliotheek.nl",
     "onlinebieb": "Online bibliotheek",
     "jeugdbieb": "Jeugdbibliotheek"
   },
   "pro": {
     "pro": "Diensten voor OB-professionals",
     "biebnetwerk": "Bnetwerk"
   },
   "defunct": {
     "defunct": "Voormalige OB-diensten",
     "leesplein": "Leesplein",
     "litplein": "Literatuurplein",
     "digeta": "Digitale etalages"
   }
 }

}


=========================[bewerken | brontekst bewerken]