"""
Copyright (c) 2022 theleekycauldron
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
"""
import pywikibot as pwb
from pywikibot import pagegenerators
import re
import requests
import datetime
import random
# Views-per-hour cutoffs a hook must clear to "pass": index 0 for hooks
# without the set image, index 1 for the imaged hook (chosen in Hook.__init__).
threshold = [600,1000]
# Month names in order, used to compute the following month's archive page.
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
site = pwb.Site("en","wikipedia")
# Edit-summary tag identifying this bot task and version.
tag = "[[[User:GalliumBot#vandyke|vandyke]] v2.3.2]"
class Article:
    """One bolded article in a DYK hook, with accumulated pageview stats.

    views/vph hold the hook-attributable traffic (raw and per-hour) after
    subtracting an estimated background level; background/background_vph
    hold that estimate; error flips to True when a pageviews API response
    cannot be parsed.
    """
    def __init__(self,title,alts=None,views=0,vph=0,background=0,background_vph=0,error=False):
        self.title = title
        self.obj = pwb.Page(site,self.title)
        self.alts = [] if alts is None else alts  # alternate titles found via page moves
        self.views = views
        self.vph = vph
        self.background = background
        self.background_vph = background_vph
        self.error = error
    def get_alts(self,timeslots):
        """Scan this page's revisions between the two timeslots for "moved
        page" edit summaries and record each summary's first [[bracketed
        title]] as an alternate name in self.alts."""
        timeslots = [pwb.Timestamp.fromisoformat(timeslot.strftime("%Y-%m-%dT%H:%M:%S")) for timeslot in timeslots]
        for revision in self.obj.revisions(starttime=timeslots[1],endtime=timeslots[0]):
            comment = revision.comment.split(" ")
            if comment[1:3] == ["moved","page"] and comment[3][:2] == "[[":
                i = 3
                # Walk forward until the closing "]]" (titles may contain spaces).
                # NOTE(review): assumes a closing "]]" exists in the summary; a
                # truncated summary would raise IndexError here.
                while comment[i][-2:] != "]]":
                    i += 1
                alt = " ".join(comment[3:i+1])[2:-2]
                if alt not in self.alts and alt != self.title:
                    self.alts.append(alt)
    def sanitize(self,title=None):
        """Encode *title* (default: self.title) for use as a path segment of
        the Wikimedia pageviews REST URL (spaces -> "_", "/" and "?" percent-
        encoded)."""
        if title is None:
            title = self.title
        # NB: the second space-like key is presumably a non-breaking space —
        # keep both entries.
        replacer = {
            " ": "_",
            " ": "_",
            "/": "%2F",
            "?": "%3F"
        }
        # Create a regular expression from the dictionary keys
        regex = re.compile("(%s)" % "|".join(map(re.escape, replacer.keys())))
        # For each match, look-up corresponding value in dictionary
        return regex.sub(lambda mo: replacer[mo.string[mo.start():mo.end()]], title)
    def get_views(self,title,dates,raw_date,time,jitter):
        """Fetch daily pageviews for *title* over the dates window and add the
        hook day's traffic (minus an estimated background) to the running
        totals.

        title: the (possibly alternate) page title to query.
        dates: [start, end] "YYYYMMDD00" strings for the API query window.
        raw_date: the hook's display date (a datetime).
        time: timedelta the hook actually spent on the main page.
        jitter: append a random max-age parameter to bust the API cache.
        On any parse failure, sets self.error and returns without updating.
        """
        jitterbug = f"?max-age={random.randint(1,1000)}" if jitter else ""
        url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/{self.sanitize(title=title)}/daily/{dates[0]}/{dates[1]}{jitterbug}"
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
        response = requests.get(url=url,headers=headers).json()
        try:
            viewsarr = [r["views"] for r in response["items"]]
            datesarr = [r["timestamp"] for r in response["items"]]
        except KeyError as e:
            self.error = True
            print(url,response,e)
            return
        date = datetime.datetime.strftime(raw_date,"%Y%m%d00")
        try:
            ind = datesarr.index(date)
            if ind < 2:
                viewsarr = [viewsarr[1-ind]]*(2-ind) + viewsarr #complicated bit of padding
                ind = 2
            # bugfix: this was an `elif`, so a hook day that needed left-padding
            # AND sat at the end of the series skipped this branch and crashed
            # with an uncaught IndexError on viewsarr[ind+1] below.
            if ind == len(viewsarr)-1:
                viewsarr.append(viewsarr[ind-1])
        except Exception as e:
            self.error = True
            print(url,response,e)
            return
        # Background estimate: previous day averaged with the quieter of the
        # two days neighbouring the hook day.
        self.background += (viewsarr[ind-1]+min(viewsarr[ind-2],viewsarr[ind+1]))/2
        self.views += viewsarr[ind] - self.background
        self.vph += 3600*self.views/time.total_seconds()
        self.background_vph += 3600*self.background/time.total_seconds()
        print(f"{self.title}: {self.vph}")
class Hook:
    """One DYK hook: its wikitext, the articles it bolds, and their
    accumulated pageview statistics."""
    def dates_of_interest(self):
        """Pick the hook's representative dates and trim self.timeslots.

        Sets self.date (midpoint of the run, floored to midnight), self.dft
        (the midnight boundary used to split a run spanning two days) and
        self.time (the length of the largest same-day segment, which the
        timeslots are narrowed to).  Returns the [start, end] window to
        request pageview data for (five days before to three days after
        self.date).
        """
        self.date = self.timeslots[0] + (self.timeslots[1]-self.timeslots[0])/2
        self.dft = self.date + datetime.timedelta(days=1) if self.date.hour>=12 else self.date
        self.date = self.date.replace(hour=0,minute=0)
        self.dft = self.dft.replace(hour=0,minute=0)
        if self.timeslots[0].day == self.timeslots[1].day: #start/end on the same day (12-hour pt. 1)
            self.time = self.timeslots[1] - self.timeslots[0]
        else: #return largest segment
            if self.timeslots[1] - self.dft > self.dft - self.timeslots[0]:
                self.time = self.timeslots[1] - self.dft
                self.timeslots[0] = self.dft
            else:
                self.time = self.dft - self.timeslots[0]
                self.timeslots[1] = self.dft
        return [self.date - datetime.timedelta(days=5),self.date + datetime.timedelta(days=3)]
    def get_views(self,jitter):
        """Accumulate pageview stats for every bolded article (plus any page
        moves detected during the run), then total them across the hook and
        sort multi-article hooks by views per hour."""
        dates = [datetime.datetime.strftime(date,"%Y%m%d00") for date in self.dates_of_interest()]
        for article in self.articles:
            article.get_views(article.title,dates,self.date,self.time,jitter)
            try:
                article.get_alts(self.timeslots)
            except pwb.exceptions.NoPageError as e:
                # page may no longer exist; log and keep going
                print(e)
                pass
            for alt in article.alts:
                # moved pages: the alternate title's views count toward the hook
                article.get_views(alt,dates,self.date,self.time,jitter)
        self.total_views = sum(article.views for article in self.articles)
        self.total_vph = sum(article.vph for article in self.articles)
        self.total_background_vph = sum(article.background_vph for article in self.articles)
        # stats is True when the hook cleared its views-per-hour threshold
        self.stats = self.total_vph >= self.threshold
        if len(self.articles)>1:
            self.articles.sort(key=lambda x:x.vph,reverse=True)
    def notify(self):
        """Search User talk pages for the DYK credit of the lead article and
        insert a {{DYK views}} line into it, unless one is already present."""
        pages = list(pagegenerators.SearchPageGenerator(f'insource:"==DYK for {self.articles[0].title}=={{{{ivmbox |image = Updated DYK query.svg"',total=5,namespaces=["User talk"],site=site))
        for page in pages:
            if "/" in page.title():
                # skip talk-page subpages/archives
                continue
            pagetext = page.text.splitlines()
            ind = pagetext.index(f"==DYK for {self.articles[0].title}==")
            if any(["{{DYK views" in line for line in pagetext[ind:ind+11]]):
                # already notified for this hook
                continue
            pagetext.insert(ind+6,f'{{{{DYK views|{round(self.total_views):,}|{round(self.total_vph,1):,}|{datetime.datetime.strftime(datetime.datetime.now(),"%B %Y")}|{self.articles[0].title}}}}} ~~~~')
            page.text = "\n".join(pagetext)
            page.save(summary=f"/* DYK for {self.articles[0].title} */ your hook reached {round(self.total_views):,} views! {tag}",botflag=True)
    def use_background(self,i): #unpythonic, but easy to fiddle with
        """Whether the table row for article i should carry the background-
        views parameter."""
        if self.articles[i].background >= 1000:
            return True
        if self.articles[i].views < 0:
            return True
        if self.total_vph<self.threshold and self.total_vph+self.total_background_vph>=self.threshold and i==0:
            return True
        return False
    def __repr__(self):
        """Render the hook as {{DYK stats table ...}} wikitext row(s), one row
        per article, with multi-article header/total rows as needed."""
        res = ""
        for i in range(len(self.articles)):
            article = self.articles[i]
            total = ""
            alts = ""
            if len(article.alts) == 1:
                alts = f"|alts=[[{article.alts[0]}]]"
            elif len(article.alts) == 2:
                alts = f"|alts=[[{article.alts[0]}]] and [[{article.alts[1]}]]"
            elif len(article.alts) > 2:
                alts = ", ".join(f"[[{alt}]]" for alt in article.alts)
                # splice "and " in just before the final [[...]] of the list
                alts = "|alts="+alts[:-(4+len(article.alts[-1]))]+"and "+alts[-(4+len(article.alts[-1])):]
            if i>0:
                # continuation rows of a multi-article hook
                head = "{{DYK stats table multi"
                if i == len(self.articles)-1:
                    total = f"\n{{{{DYK stats table multi total|{round(self.total_views):,}|{round(self.total_vph,1):,}}}}}"
                image = ""
            else:
                if len(self.articles)>1:
                    head = f"{{{{DYK stats table multi begin"
                else:
                    head = "{{DYK stats table row"
                image = '|' + self.image
            date = datetime.datetime.strftime(self.date,"%Y-%m-%d")
            background = (f"|b={article.background:,}" if self.use_background(i) else "") if not article.error else f"|error=y"
            articlecount = f"|{len(self.articles):,}" if head == '{{DYK stats table multi begin' else ''
            hooktext = self.text if head != '{{DYK stats table multi' else ''
            res += f"{head}|{article.title}{articlecount}{image}|{date}|{round(article.views):,}|{round(article.vph,1):,}|{hooktext}{background}{alts}}}}}{total}\n"
        return res
    def extract_articles(self):
        """Pull the bolded article links out of the hook text into
        self.articles as Article objects (first letter capitalized,
        duplicates removed)."""
        text = re.findall(r"'''(.+?)'''",self.text)
        text = [(expand_templates(a) if "{{" in a else a) for a in text]
        self.articles = [a[0].capitalize() + a[1:] for a in re.findall(r"\[\[(?!Category:)([^\|\]#]+)"," ".join(text))] # standard extraction
        self.articles += [a[0].capitalize() + a[1:] for a in re.findall(r"\[\[([^\|\]#]+)(?:\||\]\]|#)'''",self.text)] # missing entries because y'all CAN'T FORMAT SOMETIMES
        if len(self.articles)>1:
            self.articles = list(set(self.articles)) # rm duplicates
        self.articles = [Article(article) for article in self.articles]
    def __init__(self,text,timeslots,image,jitter):
        """text: the hook's wikitext; timeslots: [start, end] datetimes of the
        set's main-page run; image: the set's image filename ("" when this is
        not the imaged hook); jitter: cache-bust the pageviews API."""
        self.text = text # "... that '''[[leek]]s''' are objectively the best vegetable, as opposed to '''[[carrot]]s'''?"
        self.timeslots = timeslots # [datetime.datetime(2020,7,29,hour=0,minute=0),datetime.datetime(2020,7,29,hour=12,minute=0)]
        self.image = image.replace("File:","") # "Leek.jpg" or ""
        self.threshold = threshold[1] if self.image else threshold[0] #creates self threshold for background
        self.extract_articles() # ["Leek", "Carrot"]
        self.get_views(jitter) # {"Leek": 10253, "Carrot": 231}
def expand_templates(text):
    """Expand wiki templates in *text* via the MediaWiki ``expandtemplates``
    API and return the resulting wikitext, with non-breaking spaces
    normalized to ordinary spaces.

    Network errors or unexpected responses propagate to the caller
    (requests exceptions / KeyError), matching the original behavior.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "expandtemplates",
        "text": text,
        "prop": "wikitext",
        "format": "json"
    }
    # Use a context manager so the HTTP session is always closed
    # (the original created a session and never closed it).
    with requests.Session() as s:
        r = s.get(url=url, params=params)
        data = r.json()
    return data["expandtemplates"]["wikitext"].replace(" "," ")
def generate_wikitext(archivepagename):
    """Return the archive page's wikitext; for a monthly archive, prepend the
    tail of the following month's archive (from its last timestamp line) so
    the month's final set has a closing timestamp."""
    wikitext = pwb.Page(site,archivepagename).text
    if archivepagename == "Wikipedia:Recent additions":
        return wikitext
    parts = archivepagename.split("/")[1:]
    year, month = parts[0], parts[1]
    if month == "December":
        # roll over into January of the next year
        nextname = f"Wikipedia:Recent additions/{int(year)+1}/January"
    else:
        nextname = f"Wikipedia:Recent additions/{year}/{months[months.index(month)+1]}"
    nextpage = pwb.Page(site,nextname)
    if nextpage.text[:9].lower() == "#redirect":
        # the next month isn't archived yet — use the live rolling archive
        nextpage = pwb.Page(site,"Wikipedia:Recent additions")
    tail = nextpage.text[nextpage.text.rindex("*''''"):]
    return tail + "\n" + wikitext
def process_wikitext(wikitext,jitter):
    """Parse archive wikitext into a list of Hook objects, sorted by total
    views per hour (descending).

    Walks the archive line by line: a bolded "(UTC)" timestamp line closes
    the current set (the archive is newest-first, so the previously seen
    timestamp is the set's end), a {{main page image}} line records the
    set's image, and "* ..." bullet lines collect hooks.  jitter is only
    applied to the three most recent sets.
    """
    wikiarr = wikitext.splitlines()
    t1 = None
    t2 = None
    hooks = []
    output = []
    image = ""
    setnum = 0
    for line in wikiarr:
        if " (UTC)'''" in line: #timestamps
            t1 = t2
            t2 = datetime.datetime.strptime(line,"*'''''%H:%M, %d %B %Y (UTC)'''''")
            if t1 is None:
                # first timestamp seen: no complete set bracketed yet
                continue
            print(f"==={t2} -> {t1}===")
            for i in range(len(hooks)):
                # only the first hook of a set carries the set image
                output.append(Hook(hooks[i],[t2,t1],image if i==0 else "",jitter and setnum<3))
            hooks = []
            image = ""
            setnum += 1
        elif "{{main page image" in line: #image
            line = re.split("\||{{!}}",line)
            try:
                image = line[1][line[1].index("=")+1:]
            except ValueError:
                # no "=" in the first parameter; take it verbatim
                image = line[1]
        elif "* ... " in line or "*..." in line: #hook
            line = line[line.index("..."):]
            hooks.append(line)
    output.sort(key = lambda x:x.total_vph, reverse=True)
    return output
def process_data(total,archivepagename):
    """Build the monthly summary wikitext: the main summary-table row plus
    the Total/Imaged/Non-imaged per-month table rows.

    total: all Hook objects for the month, sorted by total_vph descending
    (as produced by process_wikitext).
    archivepagename: used to derive month/year link targets; a name with no
    "/year/month" suffix falls back to the current month.
    """
    try:
        monthyearlist = archivepagename.split("/")[1:]
        monthyear = monthyearlist[1] + " " + monthyearlist[0]
        yeartarget = "/"+ monthyearlist[0]
        monthyeartarget = f"/{monthyearlist[0]}/{monthyearlist[1]}"
    except IndexError as e:
        # no /year/month in the page name — use the current month instead
        monthyear = datetime.datetime.strftime(datetime.datetime.now(),"%B %Y")
        yeartarget = "/"+monthyear[monthyear.index(" ")+1:]
        monthyeartarget = "/"
    # the three hook populations summarized below
    data = {
        "Total": total,
        "Imaged": list(filter(lambda hook:hook.image != "",total)),
        "Nonimaged": list(filter(lambda hook:hook.image == "",total))
    }
    def thresholdpass(d):
        # count of hooks that cleared their views-per-hour threshold
        return sum([a.stats for a in d])
    sections = {
        "Main": "==To main summary page==\n{{DYK stats monthly summary table|",
        "Total": f"==To total table==\n<noinclude>This row is transcluded to [[Wikipedia:Did you know/Statistics/Monthly summary statistics{yeartarget}/Total]].\n{{|class=\"wikitable\"</noinclude>\n|-",
        "Imaged": f"==To imaged table==\n<noinclude>This row is transcluded to [[Wikipedia:Did you know/Statistics/Monthly summary statistics{yeartarget}/Imaged]].\n{{|class=\"wikitable\"</noinclude>\n|-",
        "Nonimaged": f"==To non-imaged table==\n<noinclude>This row is transcluded to [[Wikipedia:Did you know/Statistics/Monthly summary statistics{yeartarget}/Non-imaged]].\n{{|class=\"wikitable\"</noinclude>\n|-"
    }
    def low(d):
        # (vph, article links) of the lowest-performing hook
        return (f"{round(d[-1].total_vph,1):,}",", ".join([f"[[{x.title}]]" for x in d[-1].articles]))
    def median(d):
        # (vph, article links) of the median hook; averages the middle two
        # hooks when the count is even
        if len(d)%2==0:
            a = [len(d)//2,len(d)//2-1]
            return (f"{round((d[a[0]].total_vph+d[a[1]].total_vph)/2,1):,}","<br/>".join([", ".join([f"[[{x.title}]]" for x in d[n].articles]) for n in a]))
        else:
            a = (len(d)-1)//2
            return (f"{round(d[a].total_vph,1):,}",", ".join([f"[[{x.title}]]" for x in d[a].articles]))
    def high(d):
        # (vph, article links) of the top-performing hook
        return (f"{round(d[0].total_vph,1):,}",", ".join([f"[[{x.title}]]" for x in d[0].articles]))
    funcs = {
        "Low": low,
        "Median": median,
        "High": high
    }
    # per-category counts and pass rate
    for category in ["Total","Imaged","Nonimaged"]:
        sections[category] += f"\n|[[Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders{monthyeartarget}|{monthyear}]]"
        tp = thresholdpass(data[category])
        lc = len(data[category])
        sections[category] += f"\n| {lc}"
        sections[category] += f"\n| {tp}"
        sections[category] += f"\n| {round(100*tp/lc,1):,}"
    # low/median/high rows, mirrored into both the per-category tables and
    # the main summary row
    for stat in ["Low","Median","High"]:
        temp = f"\n{{{{DYK stats monthly summary table row|{stat}"
        for category in ["Nonimaged","Imaged","Total"]:
            res = funcs[stat](data[category])
            sections[category] += f"\n| {res[0]}"
            sections[category] += f"\n| {res[1]}"
            temp += f"|{res[0]}|{res[1]}"
        sections["Main"] += temp + "}}"
    return f"""{sections["Main"]}
}}}}
{sections["Total"]}
<noinclude>|}}</noinclude>
{sections["Imaged"]}
<noinclude>|}}</noinclude>
{sections["Nonimaged"]}
<noinclude>|}}</noinclude>"""
def main(archivepagename="Wikipedia:Recent additions",jitter=True,edit=True,notify=None):
    """Build and publish the monthly DYK pageview statistics pages.

    archivepagename: the DYK archive to process (default: the live rolling
        archive).
    jitter: cache-bust the pageviews API for the three most recent sets.
    edit: when False, compute everything but save no pages.
    notify: message nominators whose hooks passed the threshold; defaults
        to True only for the live archive with editing enabled.
    """
    if notify is None:
        notify = (archivepagename == "Wikipedia:Recent additions" and edit)
    wikitext = generate_wikitext(archivepagename) #Grab wikitext from the archive page (and the next archive page, if relevant)
    pageviews_data = process_wikitext(wikitext,jitter) #Process into a series of Hook objects
    table = f"""{{{{Wikipedia:Did you know/Statistics/Tabs|4}}}}
{{{{Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders/Navigation}}}}
{{{{Excerpt|Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders{archivepagename.replace("Wikipedia:Recent additions","")}/Summary|To main summary page|hat=no}}}}
{{{{clear}}}}
==Table==
{{{{DYK stats table|
{"".join([str(hook) for hook in pageviews_data])}}}}}""" #Write Hook objects into DYK stats table
    statspage = pwb.Page(site,archivepagename.replace("Wikipedia:Recent additions","Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders"))
    # bugfix: `is not` compared object identity of two distinct strings
    # (always True), so the null-edit check never worked; also honor
    # edit=False by skipping saves (previously it only suppressed notify)
    if edit and statspage.text != table:
        statspage.text = table
        statspage.save(summary=f"feedin' the bangtail {tag}") #editing into page
    summary = process_data(pageviews_data,archivepagename) #Obtain summary data
    summarypage = pwb.Page(site,f'Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders{archivepagename.replace("Wikipedia:Recent additions","")}/Summary')
    if edit and summarypage.text != summary:
        summarypage.text = summary
        summarypage.save(summary=f"feedin' the bangtail {tag}") #editing into page
    if notify:
        for hook in pageviews_data:
            if hook.stats:
                hook.notify() #notify nominator if past the threshold
# Run the full pipeline against the live archive when executed as a script.
if __name__ == "__main__":
    main()