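# Source wiki page: Gebruiker:Edoderoobot/ongekoppelde-paginas.py
# ("Uiterlijk" is wiki page-chrome text captured with the source; it is not part of the script.)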
import json
import sys
import urllib.request, urllib.parse
import datetime
from time import strftime
import pywikibot
#from pywikibot import pagegenerators
#from pywikibot import pagegenerators as pg, textlib, WikidataBot
from pywikibot import textlib
# Dutch month names; slot 0 holds 'nl' so that month numbers 1..12 index directly.
maanden = [
    "nl",
    "januari", "februari", "maart", "april", "mei", "juni",
    "juli", "augustus", "september", "oktober", "november", "december",
]

# When True, main() prints the generated wikitext.
debugmodus = False

# Accumulated wikitable rows per kind of page; action() adds further keys
# named after the infobox it found.
wikistr = {"taxon": "", "person": "", "dp": "", "list": "", "misc": ""}

# Title of the report page, per language code.
wikiurl = {"nl": "Wikipedia:Wikidata/Ongekoppelde pagina's"}

# Lower-cased template names that must not be counted as an infobox.
skip_templates = [
    "", "infobox/breedte", "!!", "0", "afbeelding gewenst",
    "infobox generiek", "nl", "nl-vlag", "be", "be-vlag",
    "afbeeldingbreedte",
]

# Lower-cased names of person infoboxes.
person_templates = [
    "acteur", "artiest", "artiest klassieke muziek", "atleet", "atletiek",
    "auteur", "filmregisseur", "hoogleraar", "medicus", "journalist",
    "kunstenaar", "persoon", "politicus nederland", "presentator",
    "sporter", "voetballer", "wielrenner",
]

# Lower-cased names of disambiguation-page templates.
disamb_templates = ["dpintro", "dp"]

# Lower-cased names of nomination templates; pages carrying one are skipped.
nomination_templates = [
    "auteur", "ne", "nuweg", "reclame", "wb", "wiu",
    "samenvoegen naar", "weg", "samenvoegen",
]

mustbe = "Navigatie infoboxen personen"

# Wikidata item id that a P31 claim target is compared with to recognise
# disambiguation items.
isdisamb = "Q4167410"
def is_person_template(checktemplate):
    """
    Tell whether a template is marked as a person infobox.

    The template page (looked up in namespace 10 of the default site) counts
    as a person template when it transcludes one of the two marker templates
    listed below.

    :param checktemplate: template name without namespace prefix
    :return: True if a marker template is transcluded, otherwise False
    """
    person_markers = ['Sjabloon:Navigatie infoboxen personen','Sjabloon:Afbeelding gewenst persoon']
    template_page = pywikibot.Page(pywikibot.Site(), checktemplate, 10)
    return any(
        transcluded.title() in person_markers
        for transcluded in template_page.templates()
    )
def try2find_taxon(repo,searchstr):
    """
    Look for the Wikidata item of a taxon article through its WoRMS id.

    Reads the ``worms`` parameter of the article's ``Taxobox`` template and
    compares it with the P850 claims of every Wikidata search hit for the
    article title.

    :param repo: data repository that is searched and that owns the items
    :param searchstr: title of the local article; also used as search text
    :return: tuple ``(count, item_id)``: the number of search hits examined
        and the title of the matching item, or ``None`` when nothing matched
        or more than 99 hits were seen
    """
    max_hits = 99
    site = pywikibot.Site()
    page = pywikibot.Page(site, searchstr)
    pagetext = page.get()

    worms2find = ''
    for templ, fielddict in textlib.extract_templates_and_params(pagetext):
        if templ.strip() != 'Taxobox':
            continue
        for field, value in fielddict.items():
            # Names and values may carry the whitespace/newlines of the
            # wikitext layout ("| worms = 123"), so strip before comparing.
            if field.strip() == 'worms':
                worms2find = value.strip()

    c = 0
    for onetaxon in repo.search(searchstr, [0]):
        c += 1
        if c > max_hits:
            return c, None
        wditem = pywikibot.ItemPage(repo, onetaxon.title())
        wditem.get(get_redirect=True)
        if 'P850' in wditem.claims:
            # Accept a match on any P850 claim, not only the first one.
            for claim in wditem.claims['P850']:
                if claim.getTarget() == worms2find:
                    return c, wditem.title()
    return c, None
def try2find_person(repo,searchstr):
    """
    Look for the Wikidata item of a person article.

    Collects the ``geboortedatum`` and ``imdb`` template parameters of the
    article and walks the Wikidata search hits for the article title. A hit
    matches when one of its P345 (IMDb) claims contains the article's imdb
    value, or when its first P569 (birth date) claim, written as
    ``day month-name year`` with Dutch month names, equals the article's
    birth date with the ``[[ ]]`` link brackets removed.

    :param repo: data repository that is searched and that owns the items
    :param searchstr: title of the local article; also used as search text
    :return: tuple ``(count, item_id)``: the number of search hits examined
        and the title of the matching item, or ``None`` without a match
    """
    savegebdate = saveimdb = None
    site = pywikibot.Site()
    page = pywikibot.Page(site, searchstr)
    pagetext = page.get()
    for _templ, fielddict in textlib.extract_templates_and_params(pagetext):
        for field, value in fielddict.items():
            # Names and values may carry layout whitespace; strip both.
            name = field.strip()
            if name == 'geboortedatum':
                savegebdate = value.strip()
            elif name == 'imdb':
                saveimdb = value.strip()

    # Birth date without the [[ ]] linking brackets.
    wanted_date = None
    if savegebdate:
        wanted_date = savegebdate.replace('[', '').replace(']', '')

    c = 0
    for oneperson in repo.search(searchstr, [0]):
        c += 1
        wditem = pywikibot.ItemPage(repo, oneperson.title())
        wditem.get(get_redirect=True)

        # An empty imdb value would be a substring of everything: skip it.
        if saveimdb and 'P345' in wditem.claims:
            for thisclaim in wditem.claims['P345']:
                thisimdb = thisclaim.getTarget()
                # Plain substring test. The previous rfind() result was used
                # as a boolean, which is true for "not found" (-1) and false
                # for a hit at position 0; .title() also changed the id case.
                if thisimdb is not None and saveimdb in str(thisimdb):
                    return c, wditem.title()

        if wanted_date and 'P569' in wditem.claims:
            found_date = None
            try:
                thisdate = wditem.claims['P569'][0].getTarget()
                found_date = '%d %s %d' % (thisdate.day, maanden[thisdate.month], thisdate.year)
            except (AttributeError, IndexError, TypeError):
                # No usable date value on this item; treat it as no match.
                found_date = None
            if found_date == wanted_date:
                return c, wditem.title()
    return c, None
def try2find_dp(repo,searchstr):
    """
    Look for a Wikidata disambiguation item matching a page title.

    Walks the Wikidata search hits for ``searchstr`` and returns the first
    item that has a P31 claim pointing at ``isdisamb`` and that has at least
    one sitelink.

    :param repo: data repository that is searched and that owns the items
    :param searchstr: title of the local page; used as search text
    :return: tuple ``(count, item_id)``: the position of the matching hit and
        its title; ``(count, None)`` once more than 99 hits were seen;
        ``(0, None)`` when the hits run out without a match
    """
    max_hits = 99
    c = 0
    for oneitem in repo.search(searchstr, [0]):
        c += 1
        if c > max_hits:
            return c, None
        wdpage = pywikibot.ItemPage(repo, oneitem.title())
        # Same call as in the taxon/person lookups, so a redirected item
        # does not abort the whole run.
        wdpage.get(get_redirect=True)
        if 'P31' not in wdpage.claims:
            continue
        for claim in wdpage.claims['P31']:
            target = claim.getTarget()
            # A claim can have no target value; skip those.
            if target is None:
                continue
            if target.title() == isdisamb and wdpage.sitelinks:
                return c, wdpage.title()
    return 0, None
def action(pagename):
    """
    Classify one page and append its wikitable row to ``wikistr``.

    The page's templates decide whether it is a taxon, a person or a
    disambiguation page and which infobox it carries; the first category
    whose name does not start with 'Wikipedia' is reported as its category.
    Depending on the classification one of the ``try2find_*`` helpers is
    asked for a Wikidata suggestion. Pages that carry a nomination template
    are skipped without producing a row.

    The row is appended to the 'taxon', 'person', 'dp' or 'list' entry of
    ``wikistr``, else to an entry named after the infobox, else to 'misc'.

    :param pagename: title of the page on the 'nl' site
    :return: None
    """
    isPerson = False
    isDisambigue = False
    isTaxon = False
    isList = False  # never set in this function, so the 'list' entry stays unused
    gotInfobox = False
    hasInfobox = None
    hasCategory = None
    suggest_wd = None
    level = 0

    site = pywikibot.Site('nl')
    repo = site.data_repository()
    page = pywikibot.Page(site, pagename)

    for ptemplate in page.templates():
        fulltitle = ptemplate.title()
        # Drop the first 9 characters (presumably the 'Sjabloon:' prefix).
        thistemplate = fulltitle[9:]
        lowered = thistemplate.lower()
        if lowered in nomination_templates:
            return
        if hasInfobox is None:
            if thistemplate == 'Taxobox':
                hasInfobox = 'Taxobox'
                isTaxon = True
            if thistemplate[0:7] == 'Infobox' and lowered not in skip_templates:
                gotInfobox = True  # found one, save name
                # Drop the first 17 characters (presumably 'Sjabloon:Infobox ').
                hasInfobox = fulltitle[17:]
                isPerson = isPerson or is_person_template(thistemplate)
        # Accumulate over all templates; a plain assignment here would let
        # the last template on the page decide on its own.
        isDisambigue = isDisambigue or lowered in disamb_templates

    for pcategory in page.categories():
        # Drop the first 10 characters (presumably the 'Categorie:' prefix).
        thiscat = pcategory.title()[10:]
        if thiscat[0:9] != 'Wikipedia':
            hasCategory = thiscat
            break  # only the first such category is reported

    # Later lookups overwrite the result of earlier ones.
    if isTaxon:
        level, suggest_wd = try2find_taxon(repo, pagename)
    if isDisambigue:
        level, suggest_wd = try2find_dp(repo, pagename)
    if isPerson:
        level, suggest_wd = try2find_person(repo, pagename)

    if suggest_wd is not None:
        suggest_wd = "[[:d:%s]]" % suggest_wd
    if hasCategory is None:
        hasCategory = 'None'
    if hasInfobox is None:
        hasInfobox = 'None'

    onestr = "\n|-\n|[[%s]]\n|%s\n|%s\n|%s\n|%s\n|%s\n|%i\n" % (pagename.replace('_',' '),isDisambigue,isPerson,hasInfobox.replace('_',' '),hasCategory.replace('_',' '), suggest_wd, level )

    if isTaxon:
        bucket = 'taxon'
    elif isPerson:
        bucket = 'person'
    elif isDisambigue:
        bucket = 'dp'
    elif isList:
        bucket = 'list'
    elif gotInfobox:
        bucket = hasInfobox
    else:
        bucket = 'misc'
    wikistr[bucket] = wikistr.get(bucket, '') + onestr
def main():
    """
    Build the report of pages that have no Wikidata item.

    Asks PetScan for the pages of category 'Alles' (depth 12) on nl-wiki
    that have no Wikidata item, runs ``action()`` on every title and joins
    the rows collected in ``wikistr`` into one wikitable. The wikitext is
    printed when ``debugmodus`` is set.

    To debug a single page, call ``action('<page title>')`` directly.

    :return: None
    """
    mycategory = 'Alles'
    # Built with urlencode so every name and value is escaped consistently.
    params = [
        ('language', 'nl'),
        ('project', 'wikipedia'),
        ('depth', '12'),
        ('categories', mycategory),
        ('combination', 'subset'),
        ('negcats', ''),
        ('ns[0]', '1'),
        ('larger', ''),
        ('smaller', ''),
        ('minlinks', ''),
        ('maxlinks', ''),
        ('before', ''),
        ('after', ''),
        ('max_age', ''),
        ('show_redirects', 'no'),
        ('edits[bots]', 'both'),
        ('edits[anons]', 'both'),
        ('edits[flagged]', 'both'),
        ('templates_yes', ''),
        ('templates_any', ''),
        ('templates_no', ''),
        ('outlinks_yes', ''),
        ('outlinks_any', ''),
        ('outlinks_no', ''),
        ('sparql', ''),
        ('manual_list', ''),
        ('manual_list_wiki', ''),
        ('pagepile', ''),
        ('common_wiki', 'cats'),
        ('format', 'json'),
        ('output_compatability', 'catscan'),
        ('sortby', 'none'),
        ('sortorder', 'ascending'),
        ('wikidata_item', 'without'),
        ('wikidata_label_language', ''),
        # Was the garbled literal '®exp_filter=' (a mangled '&reg' sequence).
        ('regexp_filter', ''),
        ('doit', ''),
        ('interface_language', 'en'),
        ('active_tab', 'tab_output'),
    ]
    query = 'https://petscan.wmflabs.org/?' + urllib.parse.urlencode(params)

    print("get query")
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(query) as response:
        print("process pages")
        decoded = response.read().decode('utf8')
    dps = json.loads(decoded)
    dparray = [dp['title'] for dp in dps['*'][0]['a']['*']]

    print("-------")
    for i, title in enumerate(dparray):
        print("%i - %s" % (i, title))
        action(title)

    # Date and time of generation; %M is minutes (%m would repeat the month).
    timestamp = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
    wikiString = (
        'Dit zijn [[Speciaal:OngekoppeldePaginas|pagina\'s die niet gekoppeld zijn aan items]] minus de pagina\'s die genomineerd zijn voor verwijdering.\n\n'
        'Deze pagina wordt automatisch gegenereerd, handmatige updates hebben dus geen zin!\n\n'
        'aangemaakt op %s\n\n'
        '{| class="wikitable sortable"\n|-\n!Pagina || Dp || Persoon || Infobox || Categorie || Suggestie \n'
    ) % timestamp

    print("=======")
    for section in wikistr.values():
        print(section)
        wikiString += section
    wikiString += "\n|}\n"

    # NOTE: writing the result to the page named in wikiurl['nl'] is
    # disabled; outside debug mode nothing further happens.
    if debugmodus:
        print("<!----!>")
        print(wikiString)


if __name__ == "__main__":
    main()