/**
 * Insert one URL record into the `web` table.
 *
 * @param {string} url - Page URL.
 * @param {string} title - Page title scraped from the document.
 * @param {string} [work=""] - Hostname of the site the URL belongs to.
 * @param {string} [metatitle=""] - Raw <title> text (upgrade 1/28/2022, see bot page discussion).
 * @param {boolean} [isdead=false] - True when the page returned 404/410.
 * @returns {*} The value returned by sql.query — previously assigned to an
 *   unused local and dropped, which left a floating promise and swallowed
 *   query errors. NOTE(review): assumes `sql.query(text, values)` follows the
 *   node-postgres parameterized-query API — confirm against the module setup.
 */
function uploadentry(url, title, work = "", metatitle = "", isdead = false) {
  // Parameterized query: values are bound via $1..$5, never string-concatenated.
  const inquery = "INSERT INTO web (url, title, work, metatitle, isdead) VALUES ($1,$2,$3,$4,$5)"
  const insertarray = [url, title, work, metatitle, isdead]
  // Return instead of discarding, so callers may await / inspect failures.
  // Backward compatible: existing callers ignored the undefined return value.
  return sql.query(inquery, insertarray)
}
/**
 * Fetch `geturl` in a headless browser, read its HTTP status and <title>,
 * and upload the result via uploadentry().
 *
 * Fixes vs. the previous revision:
 *  - `await` was used in a non-async function (syntax error);
 *  - `webkit.launch()`, `browser.newPage()` and `page.title()` returned
 *    promises that were never awaited;
 *  - the status was read from an undeclared `result` instead of the value
 *    returned by `page.goto`;
 *  - `statusnum => 400` / `statusnum => 500` were arrow functions (always
 *    truthy), not `>=` comparisons, so every status — including 200 — hit an
 *    early return and nothing was ever uploaded;
 *  - the browser was never closed (resource leak).
 *
 * 404/410 pages are uploaded flagged isdead=true (this is what the isDead
 * tracking was for); any other 4xx/5xx is skipped so the on-wiki URL is
 * left alone.
 *
 * @param {string} geturl - URL to visit.
 * @returns {Promise<void>}
 */
async function getAndTagTitle(geturl) {
  let isDead = false
  // Webkit (Safari) is relatively fast, light on memory / processing power,
  // and works on all of the major operating systems.
  const browser = await webkit.launch()
  try {
    const page = await browser.newPage() // Open up a new page
    const result = await page.goto(geturl) // Go to the page.
    const statusnum = result.status() // Get status code
    // https://developer.mozilla.org/en-US/docs/Web/HTTP/Status for more info.
    if (statusnum == 404 || statusnum == 410) {
      // Not found / gone: still record the entry, but mark it dead.
      isDead = true
    } else if (statusnum >= 400 && statusnum < 600) {
      // Some other client or server error. Do nothing, ignore. When coming
      // across the URL there will be no match and the url will be left alone.
      return
    }
    const title = await page.title()
    const additionalinfo = {}
    additionalinfo.metatitle = title // upgrade 1/28/2022, see bot page discussion.
    additionalinfo.work = new URL(geturl).hostname // If the website is "www.website.notexist/ffsdf", the "work" will be "www.website.notexist"
    uploadentry(geturl, title, additionalinfo.work, additionalinfo.metatitle, isDead)
    // entry has been uploaded
  } finally {
    await browser.close() // always release the browser, even on early return or throw
  }
}
/**
 * Walk the parser tokens inside one <ref>…</ref> body and, for each bare URL
 * that is safe to fill, append a |url= fragment and kick off title retrieval.
 *
 * Fixes vs. the previous revision:
 *  - loop variables, `usethisurl` and `shoulddo` were undeclared (implicit
 *    globals; a ReferenceError in strict/module code);
 *  - the nested-ref recursion referenced an undefined `obj`; it now recurses
 *    into the current `tag_inner` token;
 *  - dropped the unused `removebaretemp` local.
 *
 * @param {*} refitem - wikitext parser token list for a <ref> body.
 */
function traverse(refitem) {
  let traversedcount = -1
  for (const refobj of refitem) { // iterate over parser "objects" in the <ref></ref> in question
    traversedcount += 1 // count of objects traversed.
    if (typeof refobj == "string") {
      // This is a recursive function, so sometimes it calls a function on a string.
      // A string can not be iterated and if the object passed in is a string it
      // has gone too deep in, so step out.
      return
    }
    if (refobj.type == "url" && refobj.is_bare == true) {
      let usethisurl = refobj[0].toString()
      // Skip these, because these should either be in archive-url (out of
      // scope) or fixes for these sites haven't been integrated yet.
      if (usethisurl.indexOf("archive.") >= 0 || // archive services (note the . at the end)
          usethisurl.indexOf("webcit") >= 0 || // webcite
          usethisurl.indexOf("youtube.com") >= 0 ||
          usethisurl.indexOf("twitter.com") >= 0 ||
          usethisurl.indexOf("facebook.com") >= 0 ||
          usethisurl.indexOf("instagram.com") >= 0) {
        continue
      }
      // Iterate through the whole token list again to check for undesirables.
      let shoulddo = true
      for (const refobj2 of refitem) {
        if (typeof refobj2 == "string" && refobj2.trim() != "") {
          // Let's not fix middle ones. For example
          // <ref>https://website.website is an amazing site</ref> is not
          // something that should be filled.
          shoulddo = false
          break
        }
        if (refobj2.type == "transclusion" && refobj2.name.toLowerCase() != "bare url inline") {
          // If there is some sort of transclusion in the <ref></ref> that is
          // not recognized, skip as it might be out of scope.
          shoulddo = false
          break
        }
      }
      if (!shoulddo) {
        continue
      }
      usethisurl = usethisurl.replaceAll("|", "%7C") // escape for CS1
      // NOTE(review): `parsethis` is not declared in this chunk — presumably a
      // module-level accumulator defined elsewhere in the file; confirm.
      parsethis = parsethis + " |url=" + usethisurl
      if (usethisurl.indexOf(".pdf") >= 0) {
        continue // PDFs get no title lookup
      }
      getAndTagTitle(usethisurl) // fire-and-forget; result intentionally not awaited here
    }
    if (refobj.type == "tag_inner") {
      // Deal with nested refs, and other parser strangeness.
      traverse(refobj)
    }
  }
}
/**
 * Read a wikitext file, parse it, and run traverse() on every <ref>…</ref> body.
 *
 * Fixes vs. the previous revision:
 *  - `parsed_data` was assigned without a declaration (implicit global);
 *  - `traverse(token, datetype)` passed an undeclared `datetype`
 *    (ReferenceError in strict/module code) — traverse() takes one argument.
 *
 * @param {string} filename - Path of the file holding the page wikitext.
 */
function main(filename) {
  const wikitxt = fs.readFileSync(filename).toString()
  const page_data = CeL.net.wiki.parser(wikitxt)
  const parsed_data = page_data.parse()
  parsed_data.each("tag_inner", function refprocess(token, index, parent) {
    if (!parent || parent.tag != "ref") {
      // We don't want to convert non-ref bares
      // (e.g.: URLs out of nowhere and external link sections).
      return
    }
    traverse(token)
  })
}