/// <summary>
/// Adds a <c>|from=</c> (or <c>|from1=</c> when a <c>|from2=</c> is present) parameter holding the
/// page's Wikidata QID to the single {{Taxonbar}} on the page, or marks the page to be skipped and
/// records why. The QID is read from the <c>wikibase_item</c> page property returned by the
/// en.wikipedia.org API for <paramref name="ArticleTitle"/>.
/// </summary>
/// <param name="ArticleText">Wikitext of the page being processed.</param>
/// <param name="ArticleTitle">Page title; percent-encoded and sent to the API as the <c>titles</c> value.</param>
/// <param name="wikiNamespace">Not read by this method.</param>
/// <param name="Summary">Receives the edit summary on success, or the accumulated skip reason(s).</param>
/// <param name="Skip">Receives true when any check rejected the page.</param>
/// <returns>
/// The resulting wikitext. Note that Taxonbar alias normalisation (and removal of an empty
/// <c>from=</c>) is applied to the returned text even when <paramref name="Skip"/> is true.
/// </returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    // global switches //////////////////////////////////////////////////////////
    bool saveSkipSummaries = false;
    bool saveSkipSummariesFromOk = false;
    bool skipPagesLargerThanLimit = false; // used with int limit
    bool saveOnlyPolbotPages = false;
    bool manuallyCheckPagesWithoutAnInfobox = false; // b/c some BLPs have unnecessary {{Taxonbar}}s
    bool liveDebug = false;
    bool genFixes = true; // summary text only
    Skip = false;

    // global-use vars //////////////////////////////////////////////////////////
    int limit = 2500; // UTF-16 characters on a page; used with bool skipPagesLargerThanLimit
    bool html1Attempted = false;
    bool fromOk = false;
    Summary = "";

    // preliminary exceptions/error checking ////////////////////////////////////
    if (saveOnlyPolbotPages && !Regex.IsMatch(ArticleText, @"\bPolbot\b", RegexOptions.IgnoreCase))
    {
        Summary = @"!Polbot. ";
        Skip = true;
    }

    // A plain length comparison is equivalent to matching ^[\d\D]{limit+1} and avoids a regex scan.
    if (skipPagesLargerThanLimit && ArticleText.Length > limit)
    {
        Summary += "Too big (>" + limit + "B). ";
        Skip = true;
    }

    // check for inappropriate infoboxes
    string peopleTemplatesRegex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
    string scientistTemplatesRegex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
    bool badInfobox = Regex.IsMatch(ArticleText, peopleTemplatesRegex, RegexOptions.IgnoreCase)
        || Regex.IsMatch(ArticleText, scientistTemplatesRegex, RegexOptions.IgnoreCase);
    if (badInfobox)
    {
        Summary += @"Person/scientist infobox found. ";
        Skip = true;
    }

    // check for appropriate infoboxes
    string titleTemplatesRegex = @"\{\{\s*(?:DISPLAY ?TITLE|[Ii]talicisedtitle|[Ii]talicised[ _]+title|[Ii]talicizedtitle|[Ii]talicized[ _]+title|[Ii]talicizetitle|[Ii]talicize[ _]+title|[Ii]talicstitle|[Ii]talics[ _]+title|[Ii]talics|ITALICTITLE|[Ii]talictitle|[Ii]talic[ _]+title[ _]+infobox|[Ii]talic[ _]+title|[Ii]talic|[Ii]tal|[Rr]edirect[ _]+italic[ _]+title|[Tt]itle[ _]+italic)";
    string taxoTemplatesRegex = @"\{\{\s*(?:Template:\s*|Wikipedia:\s*)?(?:Infobox[ _]+)?(" + // prefixes
        @"Taxobox|Taxo|TX|Speciesbox|Subspeciesbox|Infraspeciesbox|" + // taxo/species
        @"Automatic[ _]+t?axobox|" + // auto
        @"bacteria|microorganism|virus|oobox" + // other
        @")(?=\s*(?:\||\<\!\-\-|" + titleTemplatesRegex + @"|(?<=Automatic[ _]+t?axobox\s*)\}\}))"; // suffixes
    bool noTaxoTemplates = !Regex.IsMatch(ArticleText, taxoTemplatesRegex, RegexOptions.IgnoreCase);

    // In manual-check mode a page without a taxobox is allowed through; if it carried a
    // person/scientist infobox, Skip was already set by the check above.
    if (noTaxoTemplates && !manuallyCheckPagesWithoutAnInfobox)
    {
        Summary += @"No auto/taxo/speciesbox found. ";
        Skip = true;
    }

    // standardize & check for {{Taxonbar
    string taxonbarAliasesRegex = @"\{\{\s*(?:[Tt]axobar|[Tt]axon\-bar|[Tt]axonbar|[Tt]axonBar|[Tt]axonIds|[Tt]axon[ _]+bar)(?=\s*[\|\}])"; // 0 grps
    ArticleText = Regex.Replace(ArticleText, taxonbarAliasesRegex, @"{{Taxonbar", RegexOptions.IgnoreCase);
    int taxonbarCount = Regex.Matches(ArticleText, @"\{\{Taxonbar", RegexOptions.IgnoreCase).Count;
    if (taxonbarCount != 1)
    {
        Summary += @"Unexpected # of taxonbars: " + taxonbarCount + ". ";
        Skip = true;
    }

    // store all {{Taxonbar...}} contents ffr
    string tbAllRegex = @"\{\{Taxonbar([^\{\}]*)\}\}";
    Match tbAllMatch = Regex.Match(ArticleText, tbAllRegex, RegexOptions.IgnoreCase);
    string tbAll = tbAllMatch.Value; // empty string when the match failed
    bool bracketsInTaxonbar = taxonbarCount > 0 && !tbAllMatch.Success;
    if (bracketsInTaxonbar)
    {
        Summary += @"Stray bracket(s) found in taxonbar. "; // can't add {{Taxonbar...}} to skip summary b/c the match failed!
        Skip = true;
    }

    // get wikibase_item via WP API
    // ex: https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
    // Uri.EscapeDataString percent-encodes the title as UTF-8, which covers '%', quotes, en dashes
    // and other non-ASCII characters that a fixed list of character replacements misses.
    string articleTitleUrl = System.Uri.EscapeDataString(ArticleTitle);
    // NOTE(review): the MediaWiki API documents boolean parameters as true whenever they are
    // present, so "redirects=0" may still resolve redirects - confirm the intended behaviour.
    string url1 = @"https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&titles=" +
        articleTitleUrl + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
    string html1 = "";
    if (!Skip)
    {
        html1Attempted = true;
        try
        {
            html1 = Tools.GetHTML(url1);
        }
        catch (System.Exception ex)
        {
            // Keep the failure reason so the skip summary is actionable.
            Summary = "HTML1 failed (" + ex.Message + "). ArticleTitle_URL = " + articleTitleUrl + ". ";
            if (!liveDebug)
            {
                Skip = true;
            }
        }
    }

    // WD/html error checks /////////////////////////////////////////////////////
    string qid = Regex.Match(html1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
    // The QID is spliced into several patterns below, including on paths where the format check
    // has failed; escaping keeps an unexpected value from producing an invalid or unintended regex.
    string qidPattern = Regex.Escape(qid);
    if (string.IsNullOrEmpty(qid) && !Skip)
    {
        Summary = @"QID retrieval failed. ";
        Skip = true;
    }
    if (!Regex.IsMatch(qid, @"^Q\d+$") && !Skip) // case sensitive, jtbs
    {
        Summary = @"Unexpected QID format. ";
        Skip = true;
    }

    bool correctExistingFrom1 = Regex.IsMatch(tbAll, @"\|\s*from1?\s*=\s*" + qidPattern + @"\s*[\|\}]", RegexOptions.IgnoreCase);
    if (correctExistingFrom1 && !Skip)
    { // this should be the normal/most frequent skip case
        Summary = @"From1 already exists (correct) in " + tbAll + ". ";
        fromOk = true;
        Skip = true;
    }

    bool anyFrom1 = Regex.IsMatch(tbAll, @"\|\s*from1?\s*=", RegexOptions.IgnoreCase);
    bool nullFrom1 = Regex.IsMatch(tbAll, @"\|\s*from1?\s*=\s*[\|\}]", RegexOptions.IgnoreCase);
    if (anyFrom1 && !nullFrom1 && !correctExistingFrom1)
    {
        Summary += @"From1 already exists (incorrect)";
        // Only append the taxonbar text if an earlier message has not already included it.
        if (Regex.IsMatch(Summary, "Taxonbar")) // case sensitive "T"
        {
            Summary += ". ";
        }
        else
        {
            Summary += " in " + tbAll + ". ";
        }
        Skip = true;
    }

    bool dupQ = Regex.Matches(tbAll, @"\b" + qidPattern + @"\b", RegexOptions.IgnoreCase).Count > 1; // case INsensitive
    if (dupQ && !Skip)
    {
        Summary += @"Duplicate """ + qid + @"""s found";
        if (Regex.IsMatch(Summary, "Taxonbar")) // case sensitive "T"
        {
            Summary += ". ";
        }
        else
        {
            Summary += " in " + tbAll + ". ";
        }
        Skip = true;
    }

    bool nonEmptyTaxonbar = !string.IsNullOrEmpty(tbAllMatch.Groups[1].Value.Trim());
    if (nonEmptyTaxonbar && !correctExistingFrom1 && html1Attempted)
    {
        if (!anyFrom1)
        {
            // OK to proceed
        }
        else if (nullFrom1)
        { // remove the empty from=/from1= so the real one can be inserted
            string tbAllNew = Regex.Replace(tbAll, @"\|\s*from1?\s*=\s*(?=[\|\}])", "", RegexOptions.IgnoreCase);
            ArticleText = ArticleText.Replace(tbAll, tbAllNew);
        }
        else
        {
            Summary += @"Extra text in TBAll; batch these more carefully later";
            if (Regex.IsMatch(Summary, "Taxonbar")) // case sensitive "T"
            {
                Summary += ". ";
            }
            else
            {
                Summary += @": " + tbAll + ". ";
            }
            Skip = true;
        }
    }

    // main /////////////////////////////////////////////////////////////////////
    if (!Skip)
    {
        bool anyFrom2 = Regex.IsMatch(tbAll, @"\|\s*from2\s*=", RegexOptions.IgnoreCase);
        string one = anyFrom2 ? "1" : ""; // only use from1 if from2 exists, otherwise from

        // Reuse the pipe/equals spacing already present in the template, falling back to the bare characters.
        string pipe = Regex.Match(tbAll, @"\{\{Taxonbar(\s*\|\s*)").Groups[1].Value;
        string equalsSign = Regex.Match(tbAll, @"\s*=\s*").Value;
        if (string.IsNullOrEmpty(pipe))
        {
            pipe = "|";
        }
        if (string.IsNullOrEmpty(equalsSign))
        {
            equalsSign = "=";
        }

        string from1 = pipe + "from" + one + equalsSign + qid;
        ArticleText = Regex.Replace(ArticleText, @"(\{\{Taxonbar)(?=\s*[\|\}])", @"$1" + from1);
        Summary = @"[[Template talk:Taxonbar#from1|Add from]]=[[d:Special:EntityPage/" + qid + @"|" + qid + @"]] to {{[[Template:Taxonbar|Taxonbar]]}}";

        // remove "form" typo if QIDs match
        string formTypo1Regex = @"(\{\{Taxonbar\s*\|\s*from1?\s*=\s*" + qidPattern + @"\s*)\|\s*form1?\s*=\s*" + qidPattern + @"\s*(?=\|)"; // 1 grp
        string formTypo2Regex = @"(\{\{Taxonbar\s*\|\s*from1?\s*=\s*" + qidPattern + @")\s*\|\s*form1?\s*=\s*" + qidPattern + @"\s*(?=\}\})"; // 1 grp
        bool formTypo1 = Regex.IsMatch(ArticleText, formTypo1Regex, RegexOptions.IgnoreCase);
        bool formTypo2 = Regex.IsMatch(ArticleText, formTypo2Regex, RegexOptions.IgnoreCase);
        ArticleText = Regex.Replace(ArticleText, formTypo1Regex, @"$1", RegexOptions.IgnoreCase);
        ArticleText = Regex.Replace(ArticleText, formTypo2Regex, @"$1", RegexOptions.IgnoreCase);
        if (formTypo1 || formTypo2)
        {
            Summary += @" (|form= typo)";
        }
        if (genFixes)
        {
            Summary += @"; [[WP:GenFixes]] on";
        }
        Summary += ",";
    }

    // exception tracking ///////////////////////////////////////////////////////
    // "Already correct" skips are only logged when saveSkipSummariesFromOk is also on.
    if (Skip && saveSkipSummaries && (!fromOk || saveSkipSummariesFromOk))
    {
        string message = ArticleTitle + "\t" + Summary + "\n";
        string fileName = @"Module output - Add from1 parameter (skip summaries).txt";
        string directory = @"F:\"; // desktop
        string fullPath = directory + fileName;
        const bool Append = true;
        Tools.WriteTextFileAbsolutePath(message, fullPath, Append);
    }

    return ArticleText;
}