Wikipedia:WikiProject User scripts/Scripts/Formatter
// Entry point: passes the article text through each formatting step in a fixed
// order and returns the formatted text.
//   ArticleText   - the text to format.
//   ArticleTitle  - not read by this method.
//   wikiNamespace - not read by this method.
//   Summary       - always set to "".
//   Skip          - always set to false.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    Summary = "";
    Skip = false;

    string formatted = catFixer(ArticleText);
    formatted = entities(formatted);
    formatted = fixheadings(formatted);
    formatted = fixsyntax(formatted);
    formatted = linkfixer(formatted, false);
    // The imagefixer step is currently commented out:
    //formatted = imagefixer(formatted);
    formatted = whitespace(formatted);

    // trim is applied twice in succession, exactly as before.
    return trim(trim(formatted));
}
// Normalises whitespace in the article text: tabs, runs of spaces, blank
// lines, spacing after list markers, spacing inside headings and spacing
// around en/em dashes. Returns the result passed through trim().
//
// The patterns are plain .NET patterns. The "g" flag of the script this was
// ported from is implicit in Regex.Replace (all matches are replaced) and the
// "m" flag is expressed as RegexOptions.Multiline.
private string whitespace(string ArticleText)
{
    // tabs become single spaces
    ArticleText = Regex.Replace(ArticleText, @"\t", " ");
    // lines holding only one to three spaces become empty lines
    ArticleText = Regex.Replace(ArticleText, @"^ ? ? \n", "\n", RegexOptions.Multiline);
    // three or more consecutive newlines collapse to two
    ArticleText = Regex.Replace(ArticleText, @"(\n\n)\n+", "$1");
    // no blank line between two directly consecutive headings
    ArticleText = Regex.Replace(ArticleText, @"== ? ?\n\n==", "==\n==");
    // no blank line in front of a bullet that starts with an http link
    ArticleText = Regex.Replace(ArticleText, @"\n\n(\* ?\[?http)", "\n$1");
    ArticleText = Regex.Replace(ArticleText, @"^ ? ? \n", "\n", RegexOptions.Multiline);
    // no blank line in front of any bullet
    ArticleText = Regex.Replace(ArticleText, @"\n\n\*", "\n*");
    // runs of spaces/tabs collapse to one space
    ArticleText = Regex.Replace(ArticleText, @"[ \t][ \t]+", " ");
    // at most one newline directly after "=\n", at most two newlines elsewhere
    ArticleText = Regex.Replace(ArticleText, @"([=\n]\n)\n+", "$1");
    // no space at the end of a line
    ArticleText = Regex.Replace(ArticleText, @" \n", "\n");

    //* bullet points: exactly one space after a leading run of * or #
    ArticleText = Regex.Replace(ArticleText, @"^([*#]+) ", "$1", RegexOptions.Multiline);
    // "#REDIRECT" is excluded so that the marker is not split into "# REDIRECT".
    ArticleText = Regex.Replace(ArticleText, @"^([*#]+)(?!REDIRECT\b)", "$1 ", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    //==Headings==: drop one space just inside the "=" markers
    ArticleText = Regex.Replace(ArticleText, @"^(={1,4}) ?(.*?) ?(={1,4})$", "$1$2$3", RegexOptions.Multiline);

    //dash — spacing
    // NOTE(review): each alternation in the source listed the same dash
    // character five times, which looks like entity spellings lost in
    // transcription. Only the literal characters are matched here; confirm
    // whether entity forms should be matched as well.
    // First remove one space on either side of every dash ...
    ArticleText = Regex.Replace(ArticleText, @" ?(–) ?", "$1");
    ArticleText = Regex.Replace(ArticleText, @" ?(—) ?", "$1");
    // ... then put a space on both sides unless a digit 1-9 is adjacent.
    ArticleText = Regex.Replace(ArticleText, @"([^1-9])(—|–)([^1-9])", "$1 $2 $3");

    return trim(ArticleText);
}
// Replaces a few HTML entities with the character they stand for (en dash,
// em dash, superscript two, degree sign) and returns the text passed through
// trim().
//
// NOTE(review): in the source every pattern was wrapped in "/.../g", which
// .NET matches literally, and each alternative was the target character
// itself, so a rule could only replace a character with itself. The entity
// spellings below (named, decimal, hexadecimal) are a reconstruction;
// confirm them against the script this was ported from.
private string entities(string ArticleText)
{
    ArticleText = Regex.Replace(ArticleText, "&ndash;|&#8211;|&#x2013;", "–");
    ArticleText = Regex.Replace(ArticleText, "&mdash;|&#8212;|&#x2014;", "—");
    // disabled in the source:
    // ArticleText = Regex.Replace(ArticleText, @"(cm| m|km|mi)<sup>2</sup>", "$1²");
    ArticleText = Regex.Replace(ArticleText, "&sup2;", "²");
    ArticleText = Regex.Replace(ArticleText, "&deg;", "°");
    return trim(ArticleText);
}
//Fix ==See also== and similar common section-heading errors.
// Rewrites common variants of standard section headings to one spelling:
// "See also", "External links", "References", "Sources", "Further reading".
// Matching of the variants ignores case. A variant is only renamed to
// "See also" when the text does not already contain a "See also" heading
// (that check is case-sensitive). Returns the rewritten text.
//
// The patterns are plain .NET patterns; the "i" flag of the script this was
// ported from is expressed as RegexOptions.IgnoreCase.
private string fixheadings(string ArticleText)
{
    if (!Regex.IsMatch(ArticleText, "= ?See also ?="))
    {
        ArticleText = Regex.Replace(ArticleText, "(== ?)(see also:?|related topics:?|related articles:?|internal links:?|also see:?)( ?==)", "$1See also$3", RegexOptions.IgnoreCase);
    }
    ArticleText = Regex.Replace(ArticleText, "(== ?)(external links?:?|outside links?|web ?links?:?|exterior links?:?)( ?==)", "$1External links$3", RegexOptions.IgnoreCase);
    ArticleText = Regex.Replace(ArticleText, "(== ?)(references?:?)( ?==)", "$1References$3", RegexOptions.IgnoreCase);
    ArticleText = Regex.Replace(ArticleText, "(== ?)(sources?:?)( ?==)", "$1Sources$3", RegexOptions.IgnoreCase);
    ArticleText = Regex.Replace(ArticleText, "(== ?)(further readings?:?)( ?==)", "$1Further reading$3", RegexOptions.IgnoreCase);
    return ArticleText;
}
// Normalises the opening of category links to "[[Category:": an optional
// space after "[[", a lower- or upper-case "c", and optional spaces around the
// colon are all accepted. Returns the text passed through trim().
//
// The pattern is a plain .NET pattern; Regex.Replace already replaces every
// match, so no "g" flag is needed.
private string catFixer(string ArticleText)
{
    string normalised = Regex.Replace(ArticleText, @"\[\[ ?[Cc]ategory ?: ?", "[[Category:");
    return trim(normalised);
}
//fixes many common syntax problems
// Replaces <i>/<b> HTML markup with wiki quote markup and rewrites <br> and
// <br/> as "<br />". Tag matching ignores case. Returns the text passed
// through trim().
//
// The patterns are plain .NET patterns; the "i" flag of the script this was
// ported from is expressed as RegexOptions.IgnoreCase.
private string fixsyntax(string ArticleText)
{
    //replace html with wiki syntax
    // Skipped entirely when any <i>/<b> tag touches an apostrophe, because the
    // inserted quote marks would merge with it.
    bool tagTouchesApostrophe = Regex.IsMatch(ArticleText, @"'</?[ib]>|</?[ib]>'", RegexOptions.IgnoreCase);
    if (!tagTouchesApostrophe)
    {
        ArticleText = Regex.Replace(ArticleText, @"<i>(.*?)</i>", "''$1''", RegexOptions.IgnoreCase);
        ArticleText = Regex.Replace(ArticleText, @"<b>(.*?)</b>", "'''$1'''", RegexOptions.IgnoreCase);
    }

    // <br> and <br/> in any letter case both become "<br />"
    ArticleText = Regex.Replace(ArticleText, @"<br/?>", "<br />", RegexOptions.IgnoreCase);
    return trim(ArticleText);
}
//formats links in standard fashion
// Formats bracketed links: separates directly adjacent links, tidies each
// internal link (underscores, spaces around the pipe), then repairs stray
// spaces and brackets in internal and http links. Returns the text passed
// through trim().
//   ArticleText - the text to process.
//   checkImages - accepted for callers; not read by this method.
//
// Every bracketed link is visited exactly once through a MatchEvaluator and
// rewritten in place.
private string linkfixer(string ArticleText, bool checkImages)
{
    ArticleText = Regex.Replace(ArticleText, @"\]\[", "] [");

    ArticleText = Regex.Replace(ArticleText, @"\[?\[[^\]]*?\]\]?", new MatchEvaluator(linkfixerTidyLink));

    //repair bad internal links
    ArticleText = Regex.Replace(ArticleText, @"\[\[ ?([^\]]*?) ?\]\]", "[[$1]]");
    ArticleText = Regex.Replace(ArticleText, @"\[\[([^\]]*?)( |_)#([^\]]*?)\]\]", "[[$1#$3]]");

    //repair bad external links
    ArticleText = Regex.Replace(ArticleText, @"\[?\[http://([^\]]*?)\]\]?", "[http://$1]", RegexOptions.IgnoreCase);
    ArticleText = Regex.Replace(ArticleText, @"\[http://([^\]]*?)\|([^\]]*?)\]", "[http://$1 $2]", RegexOptions.IgnoreCase);

    return trim(ArticleText);
}

// Tidies one bracketed link matched by linkfixer and returns its replacement.
// Links starting with "[http://" or "[image:" (one or two brackets, any case)
// are returned unchanged.
private string linkfixerTidyLink(Match found)
{
    string link = found.Value;

    //internal links only
    if (Regex.IsMatch(link, @"^\[?\[http://", RegexOptions.IgnoreCase) || Regex.IsMatch(link, @"^\[?\[image:", RegexOptions.IgnoreCase))
    {
        return link;
    }

    // A match can be as short as "[]", so check the length before slicing.
    bool targetStartsWithUnderscore = link.Length >= 3 && link.Substring(0, 3) == "[[_";

    if (link.IndexOf(':') == -1 && !targetStartsWithUnderscore && link.IndexOf("|_", System.StringComparison.Ordinal) == -1)
    {
        int pipe = link.IndexOf('|');
        if (pipe == -1)
        {
            link = link.Replace('_', ' ');
        }
        else
        {
            // underscores become spaces in the part before the pipe only
            link = link.Substring(0, pipe).Replace('_', ' ') + link.Substring(pipe);
        }
    }

    // Remove the spaces around the first pipe only (count = 1).
    link = new Regex(@" ?\| ?").Replace(link, "|", 1);
    // Literal text replacement: as a regex, "|]]" would be an alternation
    // with an empty branch and match at every position.
    link = link.Replace("|]]", "| ]]");
    return link;
}
//fixes images
// Rewrites image links in the text and returns the result passed through trim().
// The only call to this method visible in this file (in ProcessArticle) is
// commented out.
//
// NOTE(review): every pattern below still starts with "/" and ends with "/",
// "/i" or "/gi". .NET has no such delimiter/flag syntax, so those characters
// are presumably matched literally and the replacements would only fire on
// text that happens to contain them - confirm before enabling this method.
private string imagefixer(string ArticleText)
{
//remove external images
ArticleText = Regex.Replace(ArticleText, @"/\[?\[image:http:\/\/([^\]]*?)\]\]?/gi", "[http://$1]");
//fix links within internal images
// Regex.Match returns a single Match (the first one found, if any).
Match m = Regex.Match(ArticleText, @"/\[?\[image:[^\[\]]*?(\[?\[[^\]]*?\]*?[^\[\]]*?)*?\]+/gi");
if (m.Success)
{
// NOTE(review): the loop is bounded by m.Length (the number of characters in
// the matched text) and indexes m.Groups (the capture groups of that one
// match). It looks like this was meant to walk over all matches in the text -
// confirm and switch to Regex.Matches or a MatchEvaluator if so.
for (int i = 0; i < m.Length; i++)
{
string x = m.Groups[i].ToString();
string y = x;
// capitalise the leading "i" and drop the opening "[["
y = Regex.Replace(y, @"/^\[\[i/i", "I");
// drop a closing "]]" at the end
y = Regex.Replace(y, @"/\]\]$/", "");
// close a bracket that is still open at the end
y = Regex.Replace(y, @"/(\[[^\]]*?)$/", "$1]");
// format the links inside the image text
y = linkfixer(y, true);
// put the outer brackets back
y = "[[" + y + "]]";
// String.Replace substitutes every occurrence of x in the text
ArticleText = ArticleText.Replace(x, y);
}
}
return trim(ArticleText);
}
//trim start and end, trim spaces from the end of lines
// Removes the spaces at the end of every line (a trailing "\r" before the
// line break is tolerated and kept), then removes all whitespace from the
// start and the end of the whole text. Returns the trimmed text.
//
// The line pattern is a plain .NET pattern; the "m" flag of the script this
// was ported from is expressed as RegexOptions.Multiline.
private string trim(string ArticleText)
{
    string withoutLineEndSpaces = Regex.Replace(ArticleText, " +(?=\r?$)", "", RegexOptions.Multiline);
    return withoutLineEndSpaces.Trim();
}
//entities that should never be unicoded
// Writes characters that should stay as entities in entity form: every "&"
// becomes "&amp;", and the minus sign, multiplication sign, no-break space,
// thin space, soft hyphen and prime become their named entities. Returns the
// rewritten text.
//
// Because every ampersand is escaped first, entities already in the text
// (for example "&lt;" or "&#91;") come out as "&amp;lt;" / "&amp;#91;"
// without needing rules of their own. The separate rules the source had for
// "&lt;", "&gt;", "&quot;", "&apos;" and the numeric bracket/brace entities
// could never match after that step (the last two also passed a "/.../"
// pattern to String.Replace, which compares text literally), so they are
// omitted here.
//
// NOTE(review): the source held the no-break space, thin space and soft
// hyphen as raw invisible characters. They are written as escapes below;
// confirm that U+00A0, U+2009 and U+00AD are the intended code points.
private string noUnicodify(string ArticleText)
{
    ArticleText = ArticleText.Replace("&", "&amp;");
    ArticleText = ArticleText.Replace("\u2212", "&minus;");   // minus sign
    ArticleText = ArticleText.Replace("\u00D7", "&times;");   // multiplication sign
    ArticleText = ArticleText.Replace("\u00A0", "&nbsp;");    // no-break space
    ArticleText = ArticleText.Replace("\u2009", "&thinsp;");  // thin space
    ArticleText = ArticleText.Replace("\u00AD", "&shy;");     // soft hyphen
    ArticleText = ArticleText.Replace("\u2032", "&prime;");   // prime
    return ArticleText;
}