// This script takes kanji with ruby text over it and removes repeated parts
// It's called automatically by showKanji.js if any furigana was added
// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).
// License: CC0
/**
 * Entry point: reads the base text and its furigana from `#kanjiInfo`, tries to
 * bind the reading to the base, and shows the bound ruby when at least one
 * split was made.
 *
 * Returns without doing anything when the kanji or its ruby text is hidden,
 * when the `<ruby>` element or its first child node is missing, or when either
 * the base text or the reading is empty.
 *
 * @throws {Error} Whatever bindKana throws when it aborts binding.
 */
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") === "none" || $("#kanjiInfo rt").css("display") === "none") {
        return;
    }
    // Bail out quietly (instead of raising a TypeError) when the markup is absent
    const rubyElement = $("#kanjiInfo ruby")[0];
    const baseNode = rubyElement && rubyElement.childNodes[0];
    if (!baseNode) {
        return;
    }
    const kanji = baseNode.nodeValue;
    const kana = $("#kanjiInfo rt").text();
    if (!kanji || !kana) {
        return;
    }
    // Both arrays are split in place by bindKana
    const bases = [kanji];
    const readings = [kana];
    bindKana(bases, readings);
    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}
/**
 * Runs tryBind over the base/reading arrays until a pass finds no new binding
 * or the iteration cap is reached, then validates the result.
 *
 * @param {string[]} bases - Base text segments; split in place.
 * @param {string[]} readings - Reading for each base segment; split in place.
 * @throws {Error} If the two arrays end up with different lengths, if any
 *   non-empty pair has a kana:kanji length ratio >= 6 or <= 1/6, or if the
 *   iteration cap was hit while the furigana was never split.
 */
function bindKana(bases, readings) {
    const maxIterations = 25;
    let iterations = 0;
    let foundBindings = true;
    while (foundBindings && iterations !== maxIterations) {
        iterations++;
        foundBindings = tryBind(bases, readings);
    }
    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("bindKana.js: Bases and readings arrays don't have same lengths.");
    }
    // Check kanji:kana ratio (pairs with an empty side are skipped)
    for (let i = 0; i < bases.length; i++) {
        const kanjiLength = bases[i].length;
        const kanaLength = readings[i].length;
        if (kanjiLength === 0 || kanaLength === 0) { continue; }
        const ratio = kanaLength / kanjiLength;
        if (ratio >= 6 || ratio <= 1/6) {
            throw new Error("bindKana.js: kanji:kana ratio greater than 6 for `"
                + bases[i] + "` and `" + readings[i] + "`.");
        }
    }
    // The cap cut the loop short only when the last allowed pass still found a
    // binding; a run that simply finished on an earlier pass is not a cap hit.
    if (foundBindings && iterations === maxIterations) {
        console.warn("bindKana.js: Encountered maximum iterations.");
        if (bases.length === 1) {
            throw new Error("bindKana.js: Encountered maximum iterations while furigana wasn't split once.");
        }
    }
}
/**
 * Performs one binding pass: for each pair that existed when the pass started
 * and still has a reading, tries the kana/alphanumeric/misc patterns in order
 * until the arrays have grown.
 *
 * @param {string[]} bases - Base text segments; may be split in place.
 * @param {string[]} readings - Reading for each base segment; may be split in place.
 * @returns {boolean} true when this pass changed the number of segments.
 * @throws {Error} If a split left a kanji base without a reading, or a blank
 *   base with a reading.
 */
function tryBind(bases, readings) {
    const patterns = [
        kanaRegexes.katakanaRe,
        kanaRegexes.alphanumRe,
        kanaRegexes.hiraganaRe,
        kanaRegexes.miscRe,
    ];
    const countAtStart = bases.length;
    for (let idx = 0; idx < countAtStart; idx++) {
        if (readings[idx] === "") {
            continue;
        }
        for (const pattern of patterns) {
            searchBase(bases, readings, idx, pattern);
            // Stop trying further patterns once the arrays have grown
            if (bases.length !== countAtStart) {
                break;
            }
        }
    }
    if (bases.length === countAtStart) {
        return false;
    }
    // Make sure splitting didn't mess up the bindings
    for (let pos = 0; pos < bases.length; pos++) {
        const base = bases[pos];
        const reading = readings[pos];
        if (kanaRegexes.kanjiRe.test(base) && reading === "") {
            throw new Error("bindKana.js: Kanji base with no reading: `"
                + base + "` at index " + pos);
        }
        if (base === "" && reading) {
            throw new Error("bindKana.js: Blank base with reading: `"
                + reading + "` at index " + pos);
        }
    }
    return true;
}
/**
 * Looks for runs matching `re` in the base at `index` and tries to split the
 * base/reading pair on each run in turn, stopping at the first successful split.
 *
 * @param {string[]} bases - Base text segments; may be split in place.
 * @param {string[]} readings - Reading for each base segment; may be split in place.
 * @param {number} index - Position of the pair to examine.
 * @param {RegExp} re - Pattern whose matches are the candidate split points.
 * @throws {Error} If a split did not add exactly two new segments.
 */
function searchBase(bases, readings, index, re) {
    const countBefore = bases.length;
    const candidates = bases[index].match(re);
    if (!candidates) {
        return;
    }
    for (const candidate of candidates) {
        // Handle case where the furigana is just a hiragana version of the katakana
        // Only works if whole thing is split along the reading
        if (re === kanaRegexes.katakanaRe && /^[ァ-ヴ]+$/.test(bases[index])) {
            const readingAsKatakana = readings[index].hiraganaToKatakana();
            if (bases[index] === readingAsKatakana) {
                readings[index] = readingAsKatakana;
            }
        }
        // Misc stuff like whitespace should be split searching forward
        if (re === kanaRegexes.miscRe) {
            splitFuriganaForward(bases, readings, index, candidate);
        } else {
            splitFuriganaReverse(bases, readings, index, candidate);
        }
        // No split on this candidate: try the next one
        if (bases.length === countBefore) {
            continue;
        }
        // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
        if (bases.length !== countBefore + 2) {
            throw new Error("bindKana.js: Splitting added more than two new parts.");
        }
        return;
    }
}
// String helper: returns a copy in which every character in U+3041–U+3096
// (hiragana) is replaced by the character 0x60 code units above it (its
// katakana counterpart); all other characters are left as they are.
String.prototype.hiraganaToKatakana = function () {
    const shiftToKatakana = (hiragana) => {
        const shifted = hiragana.charCodeAt(0) + 0x0060;
        return String.fromCharCode(shifted);
    };
    return this.replace(/[\u3041-\u3096]/g, shiftToKatakana);
};
// We search from the end because particles are suffixes
/**
 * Splits the pair at `index` around the LAST occurrence of `substring`, but
 * only when the substring occurs in both the base and its reading.
 *
 * On a split, `bases[index]` is replaced by [left, substring, right] and
 * `readings[index]` by [left, "", right]; otherwise both arrays are untouched.
 * Plain string searching is used (no regular expression), so text containing
 * line breaks is split without losing anything on either side.
 *
 * @param {string[]} bases - Base text segments; split in place.
 * @param {string[]} readings - Reading for each base segment; split in place.
 * @param {number} index - Position of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaReverse(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];
    const baseAt = base.lastIndexOf(substring);
    const readingAt = reading.lastIndexOf(substring);
    // First make sure substring is in both the base and its reading
    if (baseAt === -1 || readingAt === -1) {
        return;
    }
    // Start at index, delete one element, and then insert left | match | right
    bases.splice(index, 1,
        base.slice(0, baseAt), substring, base.slice(baseAt + substring.length));
    // The matched part needs no furigana, so its reading is blank
    readings.splice(index, 1,
        reading.slice(0, readingAt), "", reading.slice(readingAt + substring.length));
}
/**
 * Returns `str` with its characters in reverse order.
 *
 * The string is walked by code point rather than by UTF-16 code unit, so
 * characters outside the Basic Multilingual Plane (e.g. rarer kanji such as 𠮷)
 * stay intact instead of being torn into swapped surrogate halves.
 *
 * @param {string} str - Text to reverse.
 * @returns {string} The reversed text.
 */
function reverseString(str) {
    return [...str].reverse().join("");
}
// TODO: Generalize this with reverse somehow
/**
 * Splits the pair at `index` around the FIRST occurrence of `substring`, but
 * only when the substring occurs in both the base and its reading.
 *
 * On a split, `bases[index]` is replaced by [left, substring, right] and
 * `readings[index]` by [left, "", right]; otherwise both arrays are untouched.
 * Plain string searching is used (no regular expression), so text containing
 * line breaks is split without losing anything on either side.
 *
 * @param {string[]} bases - Base text segments; split in place.
 * @param {string[]} readings - Reading for each base segment; split in place.
 * @param {number} index - Position of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaForward(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];
    const baseAt = base.indexOf(substring);
    const readingAt = reading.indexOf(substring);
    // Only split when the substring is in both the base and its reading
    if (baseAt === -1 || readingAt === -1) {
        return;
    }
    // Start at index, delete one element, and then insert left | match | right
    bases.splice(index, 1,
        base.slice(0, baseAt), substring, base.slice(baseAt + substring.length));
    // The matched part needs no furigana, so its reading is blank
    readings.splice(index, 1,
        reading.slice(0, readingAt), "", reading.slice(readingAt + substring.length));
}
/**
 * Hides the original ruby inside `#kanjiInfo` and appends a new
 * `<ruby class="bound">` built from the bound segments, then tidies the ends
 * of each base via prettifyEnds.
 *
 * Segment text is inserted as text rather than concatenated into an HTML
 * string, so characters such as `<` or `&` in a base or reading are shown
 * literally instead of being parsed as markup.
 *
 * @param {string[]} bases - Base text segments.
 * @param {string[]} readings - Reading for each base segment ("" for none).
 */
function displayBoundKana(bases, readings) {
    // Mark and hide the original ruby before the new one is added
    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");
    // Build new ruby element from the two bases and readings arrays
    const $boundRuby = $("<ruby>").addClass("bound");
    for (let i = 0; i < bases.length; i++) {
        // String() keeps .text() in setter mode even for an undefined entry
        $boundRuby.append(
            $("<rb>").text(String(bases[i])),
            $("<rt>").text(String(readings[i]))
        );
    }
    $("#kanjiInfo").append($boundRuby);
    prettifyEnds();
}
/**
 * Moves a leading and/or trailing misc character (per kanaRegexes.miscRe) of
 * every `<rb>` in `#kanjiInfo` into its own `<rb>` with an empty `<rt>`, for
 * nicer formatting. Empty bases are removed together with the element that
 * follows them; a base that is a single space is left alone.
 */
function prettifyEnds() {
    // miscRe is global, so rewind it before each single-character test
    function isMiscChar(ch) {
        kanaRegexes.miscRe.lastIndex = 0;
        return kanaRegexes.miscRe.test(ch);
    }

    $("#kanjiInfo rb").each(function () {
        const $base = $(this);
        let text = $base.text();
        if (text === "") {
            // Rm empty ruby base and readings
            $base.next().remove();
            $base.remove();
            return;
        }
        if (text === " ") {
            return;
        }

        // Leading misc character goes into its own base placed before this one
        const first = text[0];
        if (isMiscChar(first)) {
            $base.text(text.slice(1));
            $base.before("<rb>" + first + "</rb><rt></rt>");
        }

        // Trailing misc character goes after the element following this base
        text = $base.text();
        const last = text.slice(-1);
        if (isMiscChar(last)) {
            $base.text(text.slice(0, text.length - 1));
            $base.next().after("<rb>" + last + "</rb><rt></rt>");
        }
    });
}
// Character-class patterns shared by the binding and formatting steps.
// All but kanjiRe carry the `g` flag, so String#match returns every run and
// RegExp#test advances lastIndex (reset it before reusing one with test).
var kanaRegexes = {
    // A single character in one of three CJK ideograph ranges
    kanjiRe: /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/,
    // Runs of hiragana
    hiraganaRe: /[ぁ-ゔ]+/g,
    // Runs of katakana, including the long-vowel mark
    katakanaRe: /[ァ-ヴー]+/g,
    // Runs of ASCII letters and digits
    alphanumRe: /[A-Za-z0-9]+/g,
    // One punctuation/whitespace character at a time
    miscRe: /[- !.?・、「」×〜&/]/g,
};
// Run the binding once, as soon as this script is evaluated
getKanjiInfo();