#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 User:zzuuzz at English Wikipedia
"""
A script to check a MediaWiki bad image list for problems.
The main checks are:
Formatting of list items
Missing / redirected pages and files
Uses without a listed exception
Notes:
This script prints output to the terminal and doesn't make any changes.
Command line arguments are not supported.
Should work on most MediaWiki / Wikipedias
Requires:
python 3.6 +
install pywikibot module
Usage:
python3 <script name>, or maybe just: python <script name>
pwb <script name>
"""
import re
import sys
from typing import Dict, List, Set
import pywikibot
import pywikibot.data.api
# Configurable options:
# Wiki to check, passed to pywikibot.Site().
# Format like "wikipedia:en", "meta", or "wikidata".
SITE = "wikipedia:en"
# Title of the wiki page that holds the bad image list.
BIL_PAGE = "MediaWiki:Bad image list"
# Get an old revision by oldid; use False (or 0, None, "", etc) for latest:
# OLD_ID = 1065475922
OLD_ID = 0
# Pretty output flags: prefixes for the printed status lines, wrapped in
# ANSI escape sequences so the keyword is colored on capable terminals.
FAIL = "\033[91mFAIL\033[m:"
INFO = "\033[94mINFO\033[m:"
SUCCESS = "\033[32mOK\033[m:"
# Here be dragons...
class BadImageListItem:
    """A single entry parsed from the bad image list.

    Stores the line number of the entry, the image title exactly as it was
    given, and the list of exception titles that accompany it.
    """

    def __init__(self, linenum, image, exceptions) -> None:
        self.linenum, self.title, self.exceptions = linenum, image, exceptions

    @property
    def link(self) -> pywikibot.Link:
        """Build a fresh pywikibot.Link for the stored title on ``site``."""
        return pywikibot.Link(self.title, site)

    @property
    def norm_title(self) -> str:
        """Return the canonical title pywikibot derives from the link."""
        return self.link.canonical_title()
class BadImageFileInfo:
    """Convenience wrapper around one page record of an API query result.

    ``data`` is the per-page dict. The properties below only read the keys
    ``title``, ``missing``, ``imagerepository``, ``imageinfo`` (a list of
    dicts that may carry ``filemissing`` / ``canonicaltitle``) and
    ``fileusage`` (a list of dicts that may carry ``title``). Absent keys
    are treated as "no information" rather than as errors, except for
    ``title`` which is required by :attr:`title`.
    """

    def __init__(self, data: dict) -> None:
        self.data = data
        # True when the record carries a "missing" key, whatever its value.
        self.missing = "missing" in data

    @property
    def file_missing(self) -> bool:
        """True if any ``imageinfo`` item is flagged ``filemissing``."""
        return any(
            "filemissing" in item for item in self.data.get("imageinfo", ())
        )

    @property
    def is_local_image(self) -> bool:
        """True only if ``imagerepository`` is present and equals "local"."""
        return self.data.get("imagerepository") == "local"

    @property
    def is_redirect(self) -> bool:
        """True if a canonical title is known and differs from the title.

        When no canonical title is available the target is unknown, so the
        record is not reported as a redirect (it would otherwise look like a
        redirect to an empty title).
        """
        target = self.target_canonical_title
        return bool(target) and self.data["title"] != target

    @property
    def target_canonical_title(self) -> str:
        """First ``canonicaltitle`` found in ``imageinfo``, or "" if none."""
        return next(
            (
                revision["canonicaltitle"]
                for revision in self.data.get("imageinfo", ())
                if "canonicaltitle" in revision
            ),
            "",
        )

    @property
    def title(self) -> str:
        """The record's ``title`` value; raises KeyError if it is absent."""
        return self.data["title"]

    @property
    def usage(self) -> Set[str]:
        """Set of titles listed under ``fileusage`` (empty if there are none)."""
        return {
            item["title"]
            for item in self.data.get("fileusage", ())
            if "title" in item
        }
def load_fileinfo(filenames: List[str]) -> Dict[str, BadImageFileInfo]:
    """Query imageinfo and fileusage for *filenames* and wrap each page.

    The titles are requested in slices through
    ``pywikibot.data.api.PropertyGenerator`` on the module-level ``site``.
    A rolling percentage is printed while the requests run and cleared at
    the end. Returns a dict keyed by the ``title`` of each returned page
    record, mapping to a ``BadImageFileInfo`` built from that record.
    """
    chunk_len = 50  # API has a normal lower request limit of 50 pages.
    total = len(filenames)
    infos: Dict[str, BadImageFileInfo] = {}
    for start in range(0, total, chunk_len):
        # Progress is based on the number of records collected so far.
        percent = int(len(infos) / total * 100)
        print(f"\033[KGetting info ... {percent}%\r", end="")
        pages = pywikibot.data.api.PropertyGenerator(
            prop="imageinfo|fileusage",
            site=site,
            parameters={
                "fuprop": "title|redirect",
                "iilimit": 1,
                "iiprop": "badfile|canonicaltitle",
                "titles": filenames[start:start + chunk_len],
            },
        )
        for page in pages:
            infos[page["title"]] = BadImageFileInfo(page)
    print("\033[K\r", end="")  # clear rolling status
    return infos
# --- Load the list -----------------------------------------------------------
site = pywikibot.Site(SITE)
print(f"{INFO} Checking bad image list for {site.sitename}")
bil_page = pywikibot.Page(site, BIL_PAGE)
if not bil_page.exists():
    sys.exit(f"No list found at {bil_page}")
if OLD_ID:
    bil_lines = bil_page.getOldVersion(OLD_ID).splitlines()
else:
    bil_lines = bil_page.text.splitlines()
if not bil_lines:
    sys.exit("Empty list")

image_by_line: Dict[int, BadImageListItem] = dict()
image_by_name: Dict[str, List[BadImageListItem]] = dict()
line_num: int = 0
fatal_line_errors: List[int] = []
duplicates: Set[str] = set()
fileinfo: Dict[str, BadImageFileInfo] = dict()


def _line_refs(title: str) -> str:
    """Return the list line numbers of *title* as a comma-separated string.

    Titles that are not (or no longer) in ``image_by_name`` yield an empty
    string instead of raising, so a title reported by the API in a form that
    differs from the locally normalized one cannot abort the report.
    """
    return ", ".join(str(bil.linenum) for bil in image_by_name.get(title, []))


# Build data dictionary: only lines starting with "*" that contain at least
# one [[link]] are list items; the first link is the image, the rest are
# the exceptions.
for line_num, line in enumerate(bil_lines, start=1):
    if not line.startswith("*"):
        continue
    links = re.findall(r"\[\[:?([^\]]*)\]\]", line)
    if not links:
        continue
    entry = BadImageListItem(line_num, links[0], links[1:])
    try:
        entry_name = entry.norm_title
    except pywikibot.exceptions.InvalidTitleError as err:
        # An unparsable title is a formatting problem of the list, not a
        # reason to abort the whole check: report it and skip the entry.
        print(f"{FAIL} -> Error: Invalid title: {entry.title} [{line_num}] ({err})")
        continue
    image_by_line[line_num] = entry
    # Group by normalized name for duplicate detection
    image_by_name.setdefault(entry_name, []).append(entry)
if not image_by_line:
    sys.exit("No entries found")

# --- Check list problems - piped links, namespace, duplicates ----------------
print(f"{INFO} Checking for namespace and link errors")
for line_num, bil in image_by_line.items():
    link = bil.link
    is_fatal = False
    if link.anchor:
        print(f"{FAIL} -> Error: Piped link: {bil.title} [{line_num}]")
        is_fatal = True
    if link.namespace != site.namespaces.FILE:
        print(f"{FAIL} -> Error: Wrong namespace: {bil.title} [{line_num}]")
        is_fatal = True
    if is_fatal:
        # Record each line once, even when it has several fatal problems.
        fatal_line_errors.append(line_num)
    if len(image_by_name[bil.norm_title]) > 1:
        duplicates.add(bil.norm_title)

# Check duplicate file names. This runs before the fatal entries are pruned
# so that every listed line of a duplicated name can still be shown.
if duplicates:
    print(f"{FAIL} {len(duplicates)} Duplicate file names found:")
    for s in sorted(duplicates):
        print(f"-> {s} [{_line_refs(s)}]")
else:
    print(f"{SUCCESS} No duplicate file names found")

# Drop entries with fatal errors from further checks. Only the offending
# entry is removed; other entries sharing the same normalized name stay.
for line_num in fatal_line_errors:
    rejected = image_by_line.pop(line_num)
    rejected_name = rejected.norm_title
    survivors = [
        bil for bil in image_by_name[rejected_name] if bil is not rejected
    ]
    if survivors:
        image_by_name[rejected_name] = survivors
    else:
        del image_by_name[rejected_name]

# Normalize file names
for line_num, bil in image_by_line.items():
    if bil.title != bil.norm_title:
        msg = f"{INFO} Normalizable: {bil.title}"
        msg += f" -> {bil.norm_title} [{line_num}]"
        print(msg)

# --- Load file and exception info --------------------------------------------
print(f"{INFO} Checking file info")
fileinfo = load_fileinfo(list(image_by_name.keys()))

# --- Check for missing files -------------------------------------------------
print(f"{INFO} Checking for missing files")
redlinks: List[BadImageFileInfo] = []
filemissing: List[BadImageFileInfo] = []
for info in fileinfo.values():
    if not info.file_missing:
        continue
    # Page and file both missing -> red link; file missing only -> the
    # page exists but the file itself is gone.
    if info.missing:
        redlinks.append(info)
    else:
        filemissing.append(info)
if redlinks:
    print(f"{FAIL} {len(redlinks)} Red links found:")
    for info in redlinks:
        print(f"-> {info.title} [{_line_refs(info.title)}]")
else:
    print(f"{SUCCESS} No red links found")
if filemissing:
    print(f"{FAIL} {len(filemissing)} Missing files (deleted on commons):")
    for info in filemissing:
        print(f"-> {info.title} [{_line_refs(info.title)}]")
else:
    print(f"{SUCCESS} No other missing files found")

# --- Check for local and unlisted commons redirects --------------------------
print(f"{INFO} Checking for redirects")
local_redirects: List[BadImageFileInfo] = []
unlisted_commons_redirs: List[BadImageFileInfo] = []
for info in fileinfo.values():
    if not info.is_redirect:
        continue
    if info.is_local_image:
        local_redirects.append(info)
    elif info.target_canonical_title not in fileinfo:
        # Non-local redirect whose target is not itself on the list.
        unlisted_commons_redirs.append(info)
if local_redirects:
    print(f"{FAIL} {len(local_redirects)} Local redirects found:")
    for info in local_redirects:
        msg = f"-> {info.title} <- redirects to -> "
        msg += f"{info.target_canonical_title} [{_line_refs(info.title)}]"
        print(msg)
else:
    print(f"{SUCCESS} No local redirects found")
additions: Set[str] = set()
if unlisted_commons_redirs:
    print(f"{FAIL} {len(unlisted_commons_redirs)} Unlisted commons redirects:")
    for info in unlisted_commons_redirs:
        msg = f"-> {info.title} <- redirects to -> "
        msg += f"{info.target_canonical_title} [{_line_refs(info.title)}]"
        additions.add(info.target_canonical_title)
        print(msg)
else:
    print(f"{SUCCESS} No unlisted commons redirects found")
if additions:
    print(f"{INFO} {len(additions)} Possible additions for commons redirects:")
    newinfo = load_fileinfo(list(additions))
    sorted_additions = []
    for k, info in newinfo.items():
        # Usage of the first listed redirect that points at this target
        # (empty set if none does).
        current_uses: Set[str] = next(
            (
                info2.usage
                for info2 in fileinfo.values()
                if info.title == info2.target_canonical_title
            ),
            set(),
        )
        msg = f"* [[:{k}]]"
        first_sort_by_name = sorted(info.usage.union(current_uses))
        if first_sort_by_name:
            # Stable sort: order by namespace id, keeping names sorted
            # within each namespace.
            sorted_exceptions = sorted(
                first_sort_by_name,
                key=lambda x: pywikibot.Page(site, title=x).namespace().id,
            )
            msg += f" except on [[{']], [['.join(sorted_exceptions)}]]"
        sorted_additions.append(msg)
    print("\n".join(sorted(sorted_additions)))

# --- Usage / Exceptions ------------------------------------------------------
print(f"{INFO} Checking usage and exceptions")
exc: Dict[str, Dict[str, Set[str]]] = dict()
for k, bil_list in image_by_name.items():
    record = exc.setdefault(k, {"usage": set(), "exceptions": set()})
    for bil in bil_list:
        record["exceptions"].update(bil.exceptions)
for k, info in fileinfo.items():
    record = exc.setdefault(k, {"usage": set(), "exceptions": set()})
    record["usage"] = info.usage
# A file is flagged when it is used on at least one page that is not among
# its listed exceptions.
used_unexcepted: List[str] = [
    k for k, v in exc.items() if v["usage"] - v["exceptions"]
]
if used_unexcepted:
    print(f"{FAIL} Usage without exception found:")
    for s in used_unexcepted:
        msg = f"-> {s} <- used on -> "
        msg += f"{exc[s]['usage'] - exc[s]['exceptions']}"
        msg += f" [{_line_refs(s)}]"
        print(msg)
else:
    print(f"{SUCCESS} No usage without exception found")
#