This is a crude AWB custom module that can be used to find and fix previously identified spelling errors in the value assigned to |language=.
Custom module
/// <summary>
/// AWB custom-module entry point: cleans up the value of |language= inside CS1 citation
/// templates ({{cite web}}, {{cite book}}, {{citation}}, ...).
/// Replaces known misspellings and non-language values using a lookup dictionary, strips
/// markup/wikilinks/dates from the parameter, and replaces {{spaced ndash}} / {{xx icon}}
/// templates nested inside CS1 templates.
/// </summary>
/// <param name="ArticleText">Wikitext of the page being processed.</param>
/// <param name="ArticleTitle">Title of the page (not used by this module).</param>
/// <param name="wikiNamespace">Namespace number of the page (not used by this module).</param>
/// <param name="Summary">Edit summary; always starts with "CS1 fixes; " and gets a note appended
/// for dictionary fixes and for {{spaced ndash}} replacements.</param>
/// <param name="Skip">Always set to false by this module.</param>
/// <returns>The (possibly) modified wikitext.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    Skip = false;
    Summary = "CS1 fixes; ";        // TR necessary to have here; details are appended conditionally at the end
    bool changes_made = false;      // TR set by the dictionary replacement; out parameters cannot be captured by the delegate
    bool ndash = false;             // set when at least one {{spaced ndash}} / {{snd}} was replaced
    string pattern;                 // local variable to hold regex pattern for reuse
    string replaced;                // scratch result used by the fixed-point loops below
    string IS_CS1 = @"(?:[Cc]ite[_ ](?=(?:(?:AV|av) [Mm]edia(?: notes)?)|article|blog|book|conference|document|(?:DVD|dvd)(?: notes)?|encyclopa?edia|interview|journal|letter|[Mm]agazine|(?:news(?!group|paper))|paper|podcast|press release|sign|speech|techreport|thesis|video|web)|[Cc]itation|[Cc]ite(?=\s*\|))";

    //---------------------------< M I S S P E L L I N G   D I C T I O N A R Y >----------------------------------
    // This is a crude dictionary of misspellings. For each item, the first is the misspelling, the second is the correct spelling.
    // This dictionary can also be used to remove 'qualifiers' (eg Portuguese (Brazil) to Portuguese because Portuguese (Brazil) is
    // not an ISO 639-1 language). Dictionary keys MUST be lower case only because the code lower-cases every language value
    // before it searches the dictionary; a key containing upper case can never match.
    // An empty replacement blanks the parameter value.
    Dictionary<string, string> spelling_map = new Dictionary<string, string>();
    spelling_map.Add("albainian", "Albanian");
    spelling_map.Add("al", "Albanian");
    spelling_map.Add("albania", "Albanian");
    spelling_map.Add("alemán", "German");
    spelling_map.Add("american english", "");
    spelling_map.Add("american folklore society", "");
    spelling_map.Add("angol", "");
    spelling_map.Add("arabic (kuwait)", "Arabic");
    spelling_map.Add("arcs", "");
    spelling_map.Add("argentina", "Spanish");
    spelling_map.Add("australian", "");
    spelling_map.Add("austrian", "German");
    spelling_map.Add("austrian german", "German");
    spelling_map.Add("austrian-german", "German");
    spelling_map.Add("azerbajani", "Azerbaijani");
    spelling_map.Add("azerbaycani", "Azerbaijani");
    spelling_map.Add("azerbaijan", "Azerbaijani");
    spelling_map.Add("azeri", "Azerbaijani");
    spelling_map.Add("bahasa", "");                                 // bahasa = 'language'
    spelling_map.Add("bahasa indonesia", "Indonesian");
    spelling_map.Add("bahasa indonesian", "Indonesian");
    spelling_map.Add("bahasa inggris", "");                         // English
    spelling_map.Add("bahasa malaysia", "Malaysian");
    spelling_map.Add("bangla", "Bengali");
    spelling_map.Add("bbc", "");
    spelling_map.Add("belarussian", "Belarusian");
    spelling_map.Add("belorussian", "Belarusian");
    spelling_map.Add("book", "");
    spelling_map.Add("braille", "");                                // writing system, not a language
    spelling_map.Add("brazilian", "Portuguese");
    spelling_map.Add("brazilian portuguese", "Portuguese");
    spelling_map.Add("canadian english", "");
    spelling_map.Add("canadian french", "French");
    spelling_map.Add("castellano", "Spanish");
    spelling_map.Add("castellà", "Spanish");
    spelling_map.Add("castilan", "Spanish");
    spelling_map.Add("castillan", "Spanish");
    spelling_map.Add("castilian", "Spanish");
    spelling_map.Add("castilian spanish", "Spanish");
    spelling_map.Add("castillian", "Spanish");
    spelling_map.Add("castillian (spanish)", "Spanish");
    spelling_map.Add("catalano", "Catalan");
    spelling_map.Add("catalán", "Catalan");
    spelling_map.Add("català", "Catalan");
    spelling_map.Add("china", "Chinese");
    spelling_map.Add("china times", "");
    spelling_map.Add("chinese simp.", "Chinese");
    spelling_map.Add("chinese.", "Chinese");
    spelling_map.Add("chinese (simplified han)", "Chinese");
    spelling_map.Add("chinese(traditional)", "Chinese");
    spelling_map.Add("chinese (traditional)", "Chinese");
    spelling_map.Add("chinewe", "Chinese");
    spelling_map.Add("classical chinese", "Chinese");
    spelling_map.Add("cn", "Chinese");
    spelling_map.Add("costa rica", "Spanish");
    spelling_map.Add("cricinfo", "");
    spelling_map.Add("croation", "Croatian");
    spelling_map.Add("cyrillic", "");
    spelling_map.Add("cz", "cs");
    spelling_map.Add("czec", "Czech");
    spelling_map.Add("czecg", "Czech");
    spelling_map.Add("danis", "Danish");
    spelling_map.Add("deutsch", "German");
    spelling_map.Add("dhivehi", "Divehi");
    spelling_map.Add("dansih", "Danish");
    spelling_map.Add("dansk", "Danish");
    spelling_map.Add("denmark", "Danish");
    spelling_map.Add("dk", "Danish");
    spelling_map.Add("du", "Dutch");
    spelling_map.Add("duth", "Dutch");
    spelling_map.Add("duthc", "Dutch");
    spelling_map.Add("en_au", "");
    spelling_map.Add("en-au", "");
    spelling_map.Add("en-gb", "");
    spelling_map.Add("en-us", "");
    spelling_map.Add("eng", "");
    spelling_map.Add("eng.", "");
    spelling_map.Add("engl", "");
    spelling_map.Add("engliah", "");
    spelling_map.Add("englisch", "");
    spelling_map.Add("englis", "");
    spelling_map.Add("english", "");
    spelling_map.Add("english edition", "");
    spelling_map.Add("english (us)", "");                           // key must be lower case to ever match
    spelling_map.Add("english, u.k.", "");
    spelling_map.Add("englısh", "");                                // i without a dot: ı
    spelling_map.Add("english translation", "");
    spelling_map.Add("english trans. by k. k. dixit", "|others=trans. by K. K. Dixit");
    spelling_map.Add("english (american)", "");
    spelling_map.Add("english (british styled)", "");
    spelling_map.Add("english (british-style pakistani english)", "");
    spelling_map.Add("english (pakistan)", "");
    spelling_map.Add("eng;ish", "");
    spelling_map.Add("erbian", "Serbian");
    spelling_map.Add("español", "Spanish");
    spelling_map.Add("espanol", "Spanish");
    spelling_map.Add("espanhol", "Spanish");
    spelling_map.Add("estonia", "Estonian");
    spelling_map.Add("euskara", "Basque");
    spelling_map.Add("faeroese", "Faroese");
    spelling_map.Add("færøysk", "Faroese");
    spelling_map.Add("farsi", "Persian");
    spelling_map.Add("fgerman", "German");
    spelling_map.Add("finis", "Finnish");
    spelling_map.Add("finish", "Finnish");
    spelling_map.Add("finnisg", "Finnish");
    spelling_map.Add("foreword", "");
    spelling_map.Add("francais", "French");
    spelling_map.Add("français", "French");
    spelling_map.Add("france", "French");
    spelling_map.Add("francés", "French");
    spelling_map.Add("fre", "French");
    spelling_map.Add("frenc", "French");
    spelling_map.Add("frence", "French");
    spelling_map.Add("frencg", "French");
    spelling_map.Add("french (abstract)", "French");
    spelling_map.Add("gaeilge", "Irish");
    spelling_map.Add("gaeilge, [ga]", "Irish");
    spelling_map.Add("gallego", "Galician");
    spelling_map.Add("ge", "Georgian");
    spelling_map.Add("ger", "German");
    spelling_map.Add("geraman", "German");
    spelling_map.Add("germaan", "German");
    spelling_map.Add("germany", "German");
    spelling_map.Add("germană", "German");
    spelling_map.Add("german (swiss)", "German");
    spelling_map.Add("german-", "German");
    spelling_map.Add("germna", "German");
    spelling_map.Add("gernan", "German");
    spelling_map.Add("greece", "Greek");
    spelling_map.Add("greenlandic", "Kalaallisut");
    spelling_map.Add("hangul", "Korean");
    spelling_map.Add("hn", "Spanish");
    spelling_map.Add("honduran spanish", "Spanish");
    spelling_map.Add("hungary", "Hungarian");
    spelling_map.Add("imgartists.com", "");
    spelling_map.Add("indonesia", "Indonesian");
    spelling_map.Add("inglês", "");
    spelling_map.Add("inglés", "");
    spelling_map.Add("ingles", "");
    spelling_map.Add("islandic", "Icelandic");
    spelling_map.Add("israel", "Hebrew");
    spelling_map.Add("irsaeli", "Hebrew");
    spelling_map.Add("israeli", "Hebrew");
    spelling_map.Add("ilalian", "Italian");
    spelling_map.Add("italiain", "Italian");
    spelling_map.Add("italic", "Italian");
    spelling_map.Add("italics", "Italian");
    spelling_map.Add("italien", "Italian");
    spelling_map.Add("italian/milanese dialect", "Italian");
    spelling_map.Add("italin", "Italian");
    spelling_map.Add("italina", "Italian");
    spelling_map.Add("italiano", "Italian");
    spelling_map.Add("italy", "Italian");
    spelling_map.Add("itunes", "");
    spelling_map.Add("japanaese", "Japanese");
    spelling_map.Add("japaneses", "Japanese");
    spelling_map.Add("japanese)", "Japanese");
    spelling_map.Add("japonês", "Japanese");
    spelling_map.Add("japones", "Japanese");
    spelling_map.Add("japonese", "Japanese");
    spelling_map.Add("javanesse", "Javanese");
    spelling_map.Add("javascript", "");
    spelling_map.Add("jp", "ja");
    spelling_map.Add("jpn", "ja");
    spelling_map.Add("jspanese", "Japanese");
    spelling_map.Add("kannaḍa", "Kannada");
    spelling_map.Add("kiswahili", "Swahili");
    spelling_map.Add("koeran", "Korean");
    spelling_map.Add("koṅkaṇī", "Konkani");
    spelling_map.Add("korea", "Korean");
    spelling_map.Add("koreai", "Korean");
    spelling_map.Add("koream", "Korean");
    spelling_map.Add("korean=", "Korean");
    spelling_map.Add("koren", "Korean");
    spelling_map.Add("language", "");
    spelling_map.Add("lat", "Latin");                               // not Latvian
    spelling_map.Add("latin (original citation)", "Latin");
    spelling_map.Add("latín", "Latin");
    spelling_map.Add("lecture", "");
    spelling_map.Add("legalese", "");
    spelling_map.Add("lietuvių k.", "Lithuanian");
    spelling_map.Add("lithusanian", "Lithuanian");
    spelling_map.Add("magyar", "Hungarian");
    spelling_map.Add("malayalam)", "Malayalam");
    spelling_map.Add("mandarin", "Chinese");
    spelling_map.Add("mandarin chinese", "Chinese");
    spelling_map.Add("many", "");
    spelling_map.Add("manuscript latin", "Latin");
    spelling_map.Add("marāṭhī", "Marathi");
    spelling_map.Add("mexican", "Spanish");
    spelling_map.Add("mexico city", "Spanish");
    spelling_map.Add("mixed", "");
    spelling_map.Add("modern russian", "Russian");
    spelling_map.Add("mongol", "Mongolian");
    spelling_map.Add("multiple", "");
    spelling_map.Add("multiplelanguages", "");
    spelling_map.Add("multiple languages", "");
    spelling_map.Add("mx", "Spanish");
    spelling_map.Add("nepal bhasa", "Newar");                       //639-3 new
    spelling_map.Add("netherlands", "Dutch");
    spelling_map.Add("norge", "Norwegian");
    spelling_map.Add("norsk", "Norwegian");
    spelling_map.Add("norsk (bokmål)", "Norwegian Bokmål");
    spelling_map.Add("northern sámi", "Northern Sami");
    spelling_map.Add("norway", "Norwegian");
    spelling_map.Add("norwegain", "Norwegian");
    spelling_map.Add("norwegian", "Norwegian");                     // because of bug in module, since fixed
    spelling_map.Add("norwegian bokmal", "Norwegian Bokmål");
    spelling_map.Add("norwegian nynorsk", "Norwegian Nynorsk");
    spelling_map.Add("norweigen", "Norwegian");
    spelling_map.Add("norweigian", "Norwegian");
    spelling_map.Add("norwergian", "Norwegian");
    spelling_map.Add("norwgian", "Norwegian");
    spelling_map.Add("pay-per-view", "");
    spelling_map.Add("pay=per-view", "");
    spelling_map.Add("pay-per=view", "");
    spelling_map.Add("persian (farsi)", "Persian");
    spelling_map.Add("pdf", "");
    spelling_map.Add("pol", "Polish");
    spelling_map.Add("polis", "Polish");
    spelling_map.Add("polish2", "Polish");
    spelling_map.Add("polishi", "Polish");
    spelling_map.Add("polsih", "Polish");
    spelling_map.Add("portguês", "Portuguese");
    spelling_map.Add("portughese", "Portuguese");
    spelling_map.Add("portugues", "Portuguese");
    spelling_map.Add("portugués", "Portuguese");
    spelling_map.Add("portufuês", "Portuguese");
    spelling_map.Add("português", "Portuguese");
    spelling_map.Add("portugugese", "Portuguese");
    spelling_map.Add("portuagese", "Portuguese");
    spelling_map.Add("porutguese", "Portuguese");
    spelling_map.Add("portuguese (brasil)", "Portuguese");
    spelling_map.Add("portuguese (brazil)", "Portuguese");
    spelling_map.Add("potuguese", "Portuguese");
    spelling_map.Add("projekt records", "");
    spelling_map.Add("requires subscription", "");
    spelling_map.Add("romanina", "Romanian");
    spelling_map.Add("română", "Romanian");
    spelling_map.Add("rurkish", "Turkish");
    spelling_map.Add("ruassian", "Russian");
    spelling_map.Add("rus", "Russian");
    spelling_map.Add("rus.", "Russian");
    spelling_map.Add("russia", "Russian");
    spelling_map.Add("russian (translated)", "Russian");
    spelling_map.Add("sanish", "Spanish");
    spelling_map.Add("saṃskṛta", "Sanskrit");
    spelling_map.Add("sbrj", "");
    spelling_map.Add("self-published", "");
    spelling_map.Add("serbia", "Serbian");
    spelling_map.Add("serbian cyrillic", "Serbian");
    spelling_map.Add("serbian [translated]", "Serbian");
    spelling_map.Add("serbocroatian", "Serbo-Croatian");
    spelling_map.Add("serb-croatian", "Serbo-Croatian");
    spelling_map.Add("serbo - croatian", "Serbo-Croatian");         //only whitespace
    spelling_map.Add("several", "");
    spelling_map.Add("shift jis", "ja");
    spelling_map.Add("([[shift jis]])", "ja");
    spelling_map.Add("shqip", "Albanian");
    spelling_map.Add("singapore", "");
    spelling_map.Add("sinhalese", "Sinhala");
    spelling_map.Add("simplified chinese", "Chinese");
    spelling_map.Add("slovakian", "Slovak");
    spelling_map.Add("slovene", "Slovenian");
    spelling_map.Add("slovene, with a summary in english", "Slovenian");
    spelling_map.Add("slovene [slovene biographical encyclopedia]", "Slovenian");
    spelling_map.Add("slovene [slovene biographical lexicon]", "Slovenian");
    spelling_map.Add("slovenia", "Slovenian");
    spelling_map.Add("slovenian language", "Slovenian");
    spelling_map.Add("somalian", "Somali");
    spelling_map.Add("spain", "Spanish");
    spelling_map.Add("spainsh", "Spanish");
    spelling_map.Add("spanihs", "Spanish");
    spelling_map.Add("spanis", "Spanish");
    spelling_map.Add("spansih", "Spanish");
    spelling_map.Add("spanishh", "Spanish");
    spelling_map.Add("spanish=", "Spanish");
    spelling_map.Add("spanish (appendix only)", "Spanish");
    spelling_map.Add("spanish; castilian", "Spanish");
    spelling_map.Add("\"spanish (argentina)\"", "Spanish");
    spelling_map.Add("spanish (castilian)", "Spanish");
    spelling_map.Add("spanish, español", "Spanish");
    spelling_map.Add("spanish.", "Spanish");
    spelling_map.Add("suomi", "Finnish");
    spelling_map.Add("surabaya", "Indonesian");
    spelling_map.Add("svensk", "Swedish");
    spelling_map.Add("svenska", "Swedish");
    spelling_map.Add("swe", "Swedish");
    spelling_map.Add("sweden", "Swedish");
    spelling_map.Add("swedieh", "Swedish");
    spelling_map.Add("swedis", "Swedish");
    spelling_map.Add("swedish)", "Swedish");
    spelling_map.Add("swedisy", "Swedish");
    spelling_map.Add("swiss german", "German");
    spelling_map.Add("taiwanese", "Chinese");
    spelling_map.Add("telgu", "Telugu");
    spelling_map.Add("traditional chinese", "Chinese");
    spelling_map.Add("traditional han chinese", "Chinese");
    spelling_map.Add("translated", "");
    spelling_map.Add("(translated)", "");
    spelling_map.Add("tu", "Turkish");
    spelling_map.Add("tuekish", "Turkish");
    spelling_map.Add("turjish", "Turkish");
    spelling_map.Add("turish", "Turkish");
    spelling_map.Add("turkihs", "Turkish");
    spelling_map.Add("turkis", "Turkish");
    spelling_map.Add("turksh", "Turkish");
    spelling_map.Add("turksih", "Turkish");
    spelling_map.Add("turkşsh", "Turkish");
    spelling_map.Add("türkçe", "Turkish");
    spelling_map.Add("ua", "uk");
    spelling_map.Add("ucalgary", "");
    spelling_map.Add("unidentified", "");
    spelling_map.Add("ukraian", "Ukrainian");
    spelling_map.Add("ukrainan", "Ukrainian");
    spelling_map.Add("uk english", "");
    spelling_map.Add("urkish", "Turkish");
    spelling_map.Add("us", "");
    spelling_map.Add("us english", "");
    spelling_map.Add("valenciano", "Valencian");
    spelling_map.Add("various", "");
    spelling_map.Add("vietnamise", "Vietnamese");
    spelling_map.Add("vn", "Vietnamese");
    spelling_map.Add("weeds", "");
    spelling_map.Add("wessa alien plants", "");
    spelling_map.Add("west frisian", "Western Frisian");
    spelling_map.Add("zh-hans", "Chinese");
    spelling_map.Add("zh=hans", "Chinese");
    spelling_map.Add("македонски", "Macedonian");
    spelling_map.Add("-", "");

    //---------------------------< M I S C   F I X E S >----------------------------------------------------------
    // Replace {{spaced ndash}} / {{snd}} templates inside CS1 templates with ' – '.
    // Each pass replaces at most one nested template per CS1 template, so repeat until the text stops changing.
    // Looping on "did the replacement change anything" (instead of on a separate, looser test regex)
    // guarantees termination: every successful pass removes one nested template.
    pattern = @"(\{\{\s*" + IS_CS1 + @"[^\{\}]*)\{\{\s*(?:spaced\s*ndash|snd)\s*\}\}\s*";
    while (true)
    {
        replaced = Regex.Replace(ArticleText, pattern, "$1 – ");
        if (replaced == ArticleText)
        {
            break;
        }
        ArticleText = replaced;
        ndash = true;
    }

    // Replace {{xx icon}} templates with xx within CS1 templates (same fixed-point loop as above).
    pattern = @"(\{\{\s*" + IS_CS1 + @"[^\{\}]*)\{\{\s*([a-z]{2})\s*icon\s*\}\}";
    while (true)
    {
        replaced = Regex.Replace(ArticleText, pattern, "$1$2");
        if (replaced == ArticleText)
        {
            break;
        }
        ArticleText = replaced;
    }

    // When |language=<punctuation><language>, remove leading punctuation
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)[\-\.,;–—]+\s*([^\|\}]+)", "$1$2");
    // When |language=In <language>, remove 'In ' (space is required)
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)[Ii]n ([^\|\}]+)", "$1$2");
    // When |language='''<language>''', remove bold wikimarkup
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)'''([a-zA-Z\s\-]+)'''", "$1$2");
    // When |language=''<language>'', remove italic wikimarkup
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)''([a-zA-Z\s\-]+)''", "$1$2");

    // DATES
    // When |language=<language name> where <language name> is a mdy date, remove the whole parameter
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+)\|\s*language\s*=\s*[a-zA-Z]+\s*\d\d?,\s*\d{4}", "$1");
    // When |language=<language name> where <language name> is a dmy date, remove the whole parameter
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+)\|\s*language\s*=\s*\d\d?\s*[a-zA-Z]+\s*\d{4}", "$1");
    // When |language=<language name> where <language name> is a my date, remove the whole parameter
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+)\|\s*language\s*=\s*[a-zA-Z]+\s*\d{4}", "$1");
    // When |language=<language name> where <language name> is numeric or y-m-d style date, remove the whole parameter
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+)\|\s*language\s*=\s*\d[\d\s\-]*", "$1");

    // WIKILINKS: Remove simple wikilinks from |language parameters because they prevent proper categorization
    // Replace [[Text]] or ([[text]]) with Text
    pattern = @"(\{\{\s*" + IS_CS1 + @"[^\}]*\|\s*language\s*=\s*)\(?\[\[([A-Za-zá\s]+)\]\]\)?";
    ArticleText = Regex.Replace(ArticleText, pattern, "$1$2");
    // WIKILINKS: Remove complex wikilinks from |language parameters because they prevent proper categorization
    // Replace [[Article|Text]] or ([[Article|Text]]) with Text
    pattern = @"(\{\{\s*" + IS_CS1 + @"[^\}]*\|\s*language\s*=\s*)\(?\[\[[A-Za-zá\s\(\)]+\|([A-Za-zá\s]+)\]\]\)?";
    ArticleText = Regex.Replace(ArticleText, pattern, "$1$2");
    // WIKILINKS: Remove complex wikilinks in the form [[xxx{{!}}xxx]] from |language parameters because they prevent proper categorization
    // Replace [[Article{{!}}Text]] with Text
    pattern = @"(\{\{\s*" + IS_CS1 + @"[^\}]*\|\s*language\s*=\s*)\[\[[A-Za-z\s\(\)]+\{\{!\}\}([A-Za-z\s]+)\]\]";
    ArticleText = Regex.Replace(ArticleText, pattern, "$1$2");

    // When |language=<language name> language(s), remove the trailing word 'language'/'languages'
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)([a-zA-Z\s\-]+) languages?", "$1$2");
    // When |language={{xx icon, without closing }} remove icon text
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)\{\{([a-zA-Z]{2})\s*icon", "$1$2");

    // SIMPLIFIED CHINESE
    // When |language=简体中文 (handled by regex here rather than through the dictionary)
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)简体中文", "$1Chinese");
    // THAI
    // When |language=ไทย
    ArticleText = Regex.Replace(ArticleText, @"({{\s*" + IS_CS1 + @"[^}]+\|\s*language\s*=\s*)ไทย", "$1Thai");

    //---------------------------< M I S S P E L L I N G S >------------------------------------------------------
    // MISSPELLINGS: Fix misspellings in |language=<value> where <value> is misspelled.
    // Group 1 is everything up to and including '|language=', group 2 is the raw value.
    pattern = @"({{\s*" + IS_CS1 + @"[^}]*\|\s*language\s*=\s*)([^\|\}]*)";
    ArticleText = Regex.Replace(ArticleText, pattern,
        delegate(Match match)
        {
            string new_spelling;
            // Invariant lower-casing keeps the lookup independent of the machine's locale
            // (e.g. a Turkish locale would lower-case 'I' to dotless 'ı' and miss the key).
            string key = match.Groups[2].Value.Trim().ToLowerInvariant();
            if (!spelling_map.TryGetValue(key, out new_spelling))
            {
                return match.Value;         // not in dictionary: presumed correct, return the raw string
            }
            string fixed_text = match.Groups[1].Value + new_spelling;
            if (fixed_text != match.Value)
            {
                changes_made = true;        // TR Summary can't be changed here; need a dummy variable
            }
            return fixed_text;
        });

    if (changes_made)
    {
        Summary += " |language= spelling;";         // TR
    }
    if (ndash)
    {
        Summary += " remove {{spaced ndash}};";     // TR
    }
    return ArticleText;
}