// AWB custom module to remove {{tl|Cleanup bare URLs}} when there are no remaining [[WP:Bare URLs|Bare URLs]]
// v0.07 18 October 2021
// -- BHG
// NOTE this version is hacked for testing purposes.
// It skips all pages except those which get to Step 5, then fail there.
// Returns the bot's "name + version" identifier, used as the prefix of edit
// summaries and inside the debugging messages.
// The result is the wikilinked bot name, the letter "v", the version string and
// the trial suffix, concatenated with no separators added.
// NOTE(review): there is no space between the bot name and "v" -- confirm that
// this is the intended appearance in edit summaries.
public string botNV ()
{
    const string linkedName = "[[WP:BHGbot 9]]";
    const string versionLabel = "0.07 checker";
    // While the bot is in trial the suffix is " Trial"; switch it to "" afterwards.
    const string trialSuffix = " Trial";

    return string.Concat(linkedName, "v", versionLabel, trialSuffix);
}
// AWB custom-module entry point: checks whether a page tagged with
// {{Cleanup bare URLs}} (or one of its aliases) still contains bare URLs.
//
// Parameters:
//   ArticleText   - wikitext of the page being processed.
//   ArticleTitle  - not used by this method.
//   wikiNamespace - not used by this method.
//   Summary (out) - edit summary; always starts with botNV() + ": ".
//   Skip (out)    - true when the page should be skipped without saving.
//
// Returns (with "debugging" left at false):
//   * ArticleText unchanged, Skip = true, when the page has no banner tag (step 1),
//     when it still has {{Bare URL inline}} tags or bare-URL refs (step 4), or when
//     no URLs remain after step 5 (step 6 success).
//   * "STEP 5 FAIL \n\n\n" + ArticleText, Skip = false, when URLs remain after step 5.
//
// NOTE: this is the "checker" variant described in the file header: the success
// path deliberately skips the page instead of returning the text with the banner
// removed, so that only pages which fail at step 5/6 are shown for inspection.
//
// When "debugging" is true, every exit instead returns the report built by
// MakeDebugMsg with Skip = false and the debugging edit summary.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    Skip = false;
    Summary = botNV() + ": ";

    // String DECLARATIONS
    bool debugging = false;
    string debuggingEditSummary = "This is a test to debug " + botNV() + ". This edit should not have been saved, so please revert it";
    string successEditSummary = "Removed {{[[Template:Cleanup bare URLs|Cleanup bare URLs]]}}. This page currently has no bare URLs";

    // article text variables
    string nuArticleText = "";              // The page with the banner tag removed (only handed to MakeDebugMsg in this version).
    string testArticleText = ArticleText;   // Scratch copy from which known-good URLs are stripped in step 5.

    // tallies
    int CleanupBareURLsTagCount = 0;
    int bareURLinlineTagCount = 0;
    int bareURLrefCount = 0;
    int URLsremainingAfterRemovingNonBareURlsCount = 0;

    // DECLARE some regexes needed later on
    // The banner template and its aliases, with optional "Template:" prefix and parameters.
    string CleanupBareURLsTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Cc]leanup[_ ]+bare[_ ]+URLs|[Bb]are[_ ]+|[Bb]are|[Bb]are[_ ]+link|[Bb]are[_ ]+linkname|[Bb]are[_ ]+links|[Bb]are[_ ]+references|[Bb]are[_ ]+refs|[Bb]are[_ ]+URL|[Bb]are[_ ]+URLs|[Bb]are-URLs|[Bb]arelinks|[Bb]areURL|[Bb]areURLs|[Cc]leanup[_ ]+bare-URLs|[Cc]leanup[_ ]+link[_ ]+rot|[Cc]leanup[_ ]+link-rot|[Cc]leanup-Bare[_ ]+URLs|[Cc]leanup-barelinks|[Cc]leanup-link[_ ]+rot|[Cc]leanup-link-rot|[Cc]leanup-linkrot|[Cc]UBURL|[Ll]ink[_ ]+rot|[Ll]INKROT|[Ll]R) *(\|[^\}]*)?\}\}";
    // The inline tag {{Bare URL inline}} and its aliases.
    string bareURLinlineTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Bb]are[_ ]+URL[\- ]inline|[Ll]inkrot-inline|[Bb]are-inline|[Bb]are[_ ]+inline|[Bb]are[_ ]+url[_ ]+inline|[Bb]are-url[_ ]+inline|[Bb]are[_ ]+link[_ ]+inline|[Bb]are-link-inline|[Bb]are-url-inline|[Bb]are[_ ]+url) *(\|[^\}]*)?\}\}";
    // A <ref>...</ref> whose entire content is one URL, optionally in single square brackets.
    string bareURLinlineRefMatcher = @"<ref[^>]*?>\s*\[?\s*https?:[^>< \|\[\]]+\s*\]?\s*<\s*/\s*ref\s*>";
    // Any <ref>...</ref> pair whose content contains no '<' or '>'.
    string completeRefTagMatcher = @"<ref[^>]*?>[^<>]*<\s*/\s*ref\s*>";
    string citeTemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Cc](ite|itation))[^\]\{]*\}\}"; // Yes, this is crude, and will miss some cases
                                                                                            // such as cites using {{sfnref}}, but it will do
                                                                                            // for a start
    // NOTE(review): the class above excludes ']' and '{' but not '}', so the match can run on
    // to a later "}}"; presumably [^\}\{] was intended -- confirm before tightening.
    string URLtemplateMatcher = @"{\{ *([tT]emplate *: *)?(URL|Websites|URLWww|URLUrlw|URLUrl|URLUR|URLSite|URLWebsite|URLپیوند وب)\s*(\|[^\}]*)?\}\}";
    string OfficialWebsiteOrOfficialURLtemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Oo]fficial[_ ]+URL|[Oo]fficial[_ ]+website|[Cc]onditionalURL|[Cc]onditional[_ ]+URL|[Gg]et[_ ]+URL[_ ]+from[_ ]+WikiData|[Oo]fficialURL|[Oo]fficial[_ ]+url|[Oo]fficialSite|[Oo]fficial|[Cc]ompany[_ ]+Website|[Oo]fficial[_ ]+site|[Oo]fficial[_ ]+Website|[Oo]ffficial[_ ]+website|[Oo]fficial[_ ]+web[_ ]+site|[Oo]fficial[_ ]+homepage|[Hh]omepage|[Hh]ome[_ ]+page|[Oo]fficialwebsite|[Mm]ain[_ ]+website|[Oo]fficialsite|[Oo]fficial[_ ]+webpage|[Oo]fficial[_ ]+Site|[Oo]web)\s*(\|[^\}]*)?\}\}";
    string URLparameterMatcher = @"\|\s*(website|url)\s*=\s*https?:[^\|\}]*";
    string nonBareURLMatcher = @"\[\s*https?://[^>< \|\[\]]+\s+[^\]]+\]"; // a bit crude
    // NOTE(review): "(?!<" below is a negative LOOKAHEAD for a literal '<'; a negative
    // lookbehind "(?<!" was presumably intended. Harmless while the pattern is unused.
    string BareURLMatcher = @"((?!<\[ *)https?://[^>< \|\[\]]+|\[ *https?:[^>< \|\[\]]+\s*\])"; // currently unused
    // "(?!<\w)" is a negative lookahead for '<' followed by a word character. The next
    // character has to be the 'h' of "http", so the assertion never rejects anything and
    // this pattern matches every "http://" or "https://" that is followed by a word character.
    // NOTE(review): "(?<!\w)" was presumably intended. It is left as-is on purpose:
    // the step 5 removals can glue a URL onto the preceding word, and counting every
    // remaining URL keeps the final check conservative.
    string anyURLMatcher = @"(?!<\w)https?://\w"; // is this enough?

    // STEP 1. check that the page contains the banner template {{Cleanup bare URLs}}, or one of its many aliases. If not, skip the page
    MatchCollection CleanupBareURLsTagmatches = Regex.Matches(ArticleText, CleanupBareURLsTagMatcher, RegexOptions.Singleline);
    CleanupBareURLsTagCount = CleanupBareURLsTagmatches.Count;
    if (CleanupBareURLsTagCount == 0) {
        // No banner tag on the page, so there is nothing for this module to do
        if (debugging) {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(1, false, "Page contains no {{tl|CleanupBareURLsTagMatcher}} tag.", false, ArticleText);
        }
        Skip = true;
        return ArticleText;
    }

    // So we have a {{Cleanup bare URLs}} tag
    // Now create a copy of the page without the tag. This is what would be saved if there are no remaining Bare URLs
    nuArticleText = Regex.Replace(ArticleText, CleanupBareURLsTagMatcher, "", RegexOptions.Singleline);

    // STEP 2. count the number of {{Bare URL inline}} tags in the page, including aliases
    MatchCollection bareURLinlineTagmatches = Regex.Matches(ArticleText, bareURLinlineTagMatcher, RegexOptions.Singleline);
    bareURLinlineTagCount = bareURLinlineTagmatches.Count;

    // STEP 3. count the number of <ref> tags whose whole content is a bare URL
    // FIX: match case-insensitively. Step 5.A strips ref tags with IgnoreCase, so a
    // case-sensitive count here let refs such as <REF>http://...</REF> or
    // <ref>HTTP://...</ref> escape this check and then be discarded as "already checked".
    MatchCollection bareURLrefmatches = Regex.Matches(ArticleText, bareURLinlineRefMatcher, RegexOptions.Singleline | RegexOptions.IgnoreCase);
    bareURLrefCount = bareURLrefmatches.Count;

    // STEP 4. if the total matches of step 2 + step 3 is greater than zero, then skip the page
    if ((bareURLinlineTagCount + bareURLrefCount) > 0) {
        // This page still has inline bare-URL tags and/or bare URL refs, so skip this page
        if (debugging) {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(4, false, "Page still has some bare URL refs.\n* bareURLinlineTagCount=" + bareURLinlineTagCount + "\n* bareURLrefCount=" + bareURLrefCount, false, ArticleText);
        }
        Skip = true;
        return ArticleText;
    }

    // STEP 5. check for bare URLs not in ref tags
    //
    // In this step we proceed by working on a copy of the article from which we remove URLs which are known to be OK
    // Then we check whether any URLs remain

    // STEP 5.A: remove all ref tags.
    // Refs consisting solely of a URL were counted in step 3, so remove the remaining ref tags and their contents.
    testArticleText = Regex.Replace(testArticleText, completeRefTagMatcher, "", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // STEP 5.B: remove all {{cite}} templates.
    // Anything inside a {{cite}} template is good, so just remove the whole template
    testArticleText = Regex.Replace(testArticleText, citeTemplateMatcher, "", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // STEP 5.C: remove any {{URL}} templates.
    // Anything inside a {{URL}} template is good, so just remove the whole template
    testArticleText = Regex.Replace(testArticleText, URLtemplateMatcher, "", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // STEP 5.D: remove any {{Official URL}} or {{Official website}} templates.
    // Anything inside an {{Official URL}} or {{Official website}} template is good, so just remove the whole template
    testArticleText = Regex.Replace(testArticleText, OfficialWebsiteOrOfficialURLtemplateMatcher, "", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // STEP 5.E: remove any URL which is a value of a template parameter "url=" or "website="
    // e.g. "|website=https://example.com" or "|url=https://example.com"
    testArticleText = Regex.Replace(testArticleText, URLparameterMatcher, "", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // STEP 5.F: remove any non-bare URLs
    // e.g. "[https://example.com foo]"
    testArticleText = Regex.Replace(testArticleText, nonBareURLMatcher, "", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // STEP 6: does the page still contain any URLs?
    MatchCollection RemainingURLsMatches = Regex.Matches(testArticleText, anyURLMatcher, RegexOptions.Singleline | RegexOptions.IgnoreCase);
    URLsremainingAfterRemovingNonBareURlsCount = RemainingURLsMatches.Count;
    if (URLsremainingAfterRemovingNonBareURlsCount == 0) {
        // SUCCESS! No bare URLs, so the tag could be removed
        if (debugging) {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(6, true, "Page contains no [[WP:Bare URLs]].", true, nuArticleText);
        }
        // Checker variant: a clean page is skipped and returned unchanged rather than
        // saved with the banner removed (see the note in the file header).
        Skip = true;
        Summary = botNV() + ": " + successEditSummary;
        return ArticleText;
    }

    // FAILURE
    // If we get here, then the page still contains URLs that none of the step 5 filters removed.
    // Checker variant: do not skip, and prefix the text with a marker for inspection.
    Skip = false;
    return "STEP 5 FAIL \n\n\n" + ArticleText;
}
// Builds the diagnostic page text returned by ProcessArticle when debugging is on.
//
// Parameters:
//   stepNum      - number of the processing step that produced this report.
//   testsOK      - true reports STATUS "Success", false reports "Fail".
//   debugMessage - free-text notes shown after "NOTES: ".
//   textChanged  - whether pageText differs from the original article text;
//                  only selects which explanatory sentence is shown.
//   pageText     - the article text to show below the separator line.
//
// Returns the report header, a separator line, and then pageText.
public string MakeDebugMsg(int stepNum, bool testsOK, string debugMessage, bool textChanged, string pageText)
{
    string statusWord = testsOK ? "Success" : "Fail";
    // The trailing "\n" on the "unmodified" sentence only is kept from the original wording.
    string textStateNote = textChanged
        ? "This text has been modified"
        : "This is the original text, unmodified\n";

    string retval = "DEBUGGING " + botNV() + ". --- This edit should NOT have been saved. Please revert.\n";
    retval = retval + "\nSTEP: " + stepNum;
    retval = retval + "\nSTATUS: " + statusWord;
    retval = retval + "\nNOTES: " + debugMessage;
    retval = retval + "\n\nArticle text follows below the line. " + textStateNote;
    retval = retval + "\n\n____________________________________";

    // FIX: the report announces that the article text follows below the line, but
    // pageText was never appended (the parameter was unused). A null pageText
    // concatenates as an empty string.
    retval = retval + "\n" + pageText;
    return retval;
}