# Gebruiker:DajasjBot/archivelinks.py
# ("Uiterlijk" — stray page-navigation text from the wiki export, kept as a comment)
from bz2file import BZ2File
from lxml import etree
import os
import waybackpy
import pywikibot
import time
import re
from tqdm import notebook
from datetime import datetime
# Live-wiki connection; project/language come from the user's pywikibot config.
site = pywikibot.Site()
# Timestamp of the last page save — used below to throttle edits to roughly
# one saved page per minute.
start = datetime.now()
# Browser-like User-Agent string sent with Wayback Machine requests.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
def citetemplate_to_dict(citetemplate):
    """Parse a ``{{Citeer web ...}}`` template into a parameter dict.

    *citetemplate* is the full template text including the closing ``}}``.
    Splitting on ``|`` also splits pipes inside parameter values (e.g.
    ``[[wikilink|label]]``); such key-less continuation pieces are glued
    back onto the most recently seen parameter.

    Fix: the original only re-attached pieces containing ``]]`` and looked
    one piece back for the key, which silently dropped middle pieces and
    raised KeyError for values with two or more pipes (``[[A|B|C]]``).
    Tracking the last assigned key handles any number of pipes.
    """
    # Drop the trailing "}}", then the leading "{{Citeer web" piece.
    pieces = citetemplate[:-2].split("|")[1:]
    params = {}
    last_key = None
    for piece in pieces:
        if "=" in piece:
            key, value = piece.split("=", 1)
            params[key] = value
            last_key = key
        elif last_key is not None:
            # Pipe inside a value: re-attach the piece to the last parameter.
            params[last_key] += "|" + piece
    return params
# Dutch month names mapped to their 1-based month numbers.
datedict = {
    name: number
    for number, name in enumerate(
        [
            "januari", "februari", "maart", "april", "mei", "juni",
            "juli", "augustus", "september", "oktober", "november",
            "december",
        ],
        start=1,
    )
}
# Manually download the dump file (e.g. from dumps.wikimedia.org) before running.

# MediaWiki XML export namespace used by every tag in the dump.
MW_NS = '{http://www.mediawiki.org/xml/export-0.10/}'
# A {{Citeer web}} template without nested templates.
CITE_RE = re.compile(r"{{[Cc]iteer web[^{}]*?}}")
# A template that already carries a usable (>= 10 chars) archive URL, under
# either the Dutch ("archiefurl"/"archief-url") or English parameter name.
HAS_ARCHIVE_RE = re.compile(
    r"archief-{0,1}url\s*?=\s*?[^\|\s]{10,}"
    r"|archive-{0,1}url\s*?=\s*?[^\|\s]{10,}",
    re.IGNORECASE)
# Supported retrieval-date formats.
ISO_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")
DMY_NUMERIC_RE = re.compile(r"\d{1,2}-\d{1,2}-\d{4}")
DMY_TEXTUAL_RE = re.compile(r"(\d{1,2})\s([a-zA-Z]{3,10})\s\d{4}", re.IGNORECASE)
# Parameter aliases for the retrieval date, probed in this order.
# Fix: the original probed 'access-date ' (trailing space), which could never
# match because the parameter keys are stripped before the lookup.
RETRIEVEDATE_KEYS = ("bezochtdatum", "accessdate", "datumbezocht",
                     "datumgeraadpleegd", "raadpleegdatum", "access-date")
# Archive-related parameters the bot removes before adding its own.
ARCHIVE_PARAMS = frozenset([
    "archiefdatum", "archivedate", "archive-date", "dodeurl",
    "dode-url", "deadurl", "dead-url", "archiefurl",
    "archiveurl", "archive-url",
])


def _templates_missing_archive(wikitext):
    """Return every {{Citeer web}} template in *wikitext* lacking an archive URL."""
    return [t for t in CITE_RE.findall(wikitext) if not HAS_ARCHIVE_RE.search(t)]


def _parse_retrievedate(retrievedate):
    """Parse a retrieval date into ``(year, month, day)`` ints, or None.

    Accepts ISO (2021-08-25), numeric day-month-year (25-8-2021) and Dutch
    textual (25 augustus 2021) forms.

    Fix: the original matched the textual form against the hard-coded string
    "25 augustus 2021" instead of the actual date being parsed.
    """
    if ISO_DATE_RE.search(retrievedate):
        return (int(retrievedate[:4]), int(retrievedate[5:7]),
                int(retrievedate[8:10]))
    if DMY_NUMERIC_RE.search(retrievedate):
        parts = retrievedate.split("-")
        # day-month-year order, as in the original.
        return int(parts[-1]), int(parts[1]), int(parts[0])
    textual = DMY_TEXTUAL_RE.search(retrievedate)
    if textual:
        month_name = textual.group(2).lower()
        if month_name not in datedict:
            return None
        return (int(retrievedate[-4:]), datedict[month_name],
                int(textual.group(1)))
    return None


def _free_element(elem):
    """Release a processed dump element and its already-seen siblings.

    Standard lxml iterparse memory idiom: clear the element, then delete
    preceding siblings up the ancestor chain so the parsed tree does not
    accumulate. Fix: the original skipped this cleanup whenever the loop
    body hit a ``continue``, leaking memory on every skipped page.
    """
    elem.clear()
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]


with BZ2File("nlwiki-latest-pages-articles.xml.bz2") as xml_file:
    file = etree.iterparse(xml_file, tag=MW_NS + 'page')
    for _, dump_page in notebook.tqdm(file):
        try:
            # Revision text and page title from the dump.
            dump_text = dump_page.findtext(MW_NS + 'revision/' + MW_NS + 'text')
            title = dump_page.findtext(MW_NS + 'title')
            # Skip pages with no text element at all.
            if not dump_text:
                continue
            # Check whether the dump copy has cite templates missing archive urls.
            if not _templates_missing_archive(dump_text):
                continue
            # REMOVE: hard-coded sandbox page for testing; delete for production.
            title = "Gebruiker:DajasjBot/Kladblok"
            # Get the live version of the page.
            page = pywikibot.Page(site, title)
            live_text = page.text
            # Skip pages the bot cannot edit (sysop-only protection).
            protection = page.protection()
            if protection.get("edit", (None,))[0] == "sysop":
                continue
            # UNCOMMENT for production: restrict edits to the main namespace.
            # if page.namespace().id != 0:
            #     continue
            # The live version must also be missing archive urls.
            list_citetemplates_live = _templates_missing_archive(live_text)
            if not list_citetemplates_live:
                continue
            changed = False
            for citetemplate in list_citetemplates_live:
                # Stripped view of the parameters for lookups.
                params = {
                    key.strip(): value.strip()
                    for key, value in citetemplate_to_dict(citetemplate).items()
                }
                # URL parameter; lower-case spelling takes precedence.
                url = params.get('url', params.get('URL'))
                if url is None:
                    continue
                # First matching retrieval-date alias, empty string if none.
                retrievedate = ""
                for key in RETRIEVEDATE_KEYS:
                    if key in params:
                        retrievedate = params[key]
                        break
                wayback = waybackpy.Url(url, user_agent)
                if wayback.total_archives() == 0:
                    # Not archived yet: ask the Wayback Machine to save it now.
                    try:
                        archive = wayback.save()
                    except Exception as e:
                        print(e)
                        print(url)
                        continue
                else:
                    retrievedate = retrievedate.strip()
                    if retrievedate != "":
                        # Pick the snapshot closest to the retrieval date.
                        parsed = _parse_retrievedate(retrievedate)
                        if parsed is None:
                            print("date is wrong")
                            print(url)
                            print(retrievedate)
                            continue
                        year, month, day = parsed
                        try:
                            archive = wayback.near(year=year, month=month,
                                                   day=day)
                        except Exception as e:
                            print(url)
                            print(e)
                            continue
                    else:
                        # No retrieval date: fall back to the newest snapshot.
                        archive = wayback.newest()
                # Rebuild the template from its original (unstripped)
                # parameters, dropping anything archive-related first.
                new_params = {
                    key: value
                    for key, value in citetemplate_to_dict(citetemplate).items()
                    if key.strip() not in ARCHIVE_PARAMS
                }
                new_params['archiefurl'] = archive.archive_url
                new_params['archiefdatum'] = archive.timestamp.strftime(
                    "%Y-%m-%d")
                new_params['dodeurl'] = "nee"
                new_citetemplate = (
                    citetemplate.split("|")[0] + "|"
                    + "|".join(key + "=" + value
                               for key, value in new_params.items())
                    + "}}")
                # Substitute the amended template into the live text.
                live_text = live_text.replace(citetemplate, new_citetemplate)
                changed = True
            if changed:
                page.text = live_text
                page.save(u"Archiefurl toegevoegd")
                # Throttle: at most roughly one saved page per minute.
                time.sleep(max(0, 60 - (datetime.now() - start).seconds))
                start = datetime.now()
                # Testing safeguard: stop after the first edited page.
                break
        finally:
            # Always free memory, even when a `continue`/`break` skipped ahead.
            _free_element(dump_page)