User:GreenC/software/search wikipedia
Method to accurately search Wikipedia
Find all articles which contain the string "sportsillustrated.cnn.com" AND a {{dead}}
template AND < whatever >. Solving complicated Wikipedia searches like this is trivial: download the Wikipedia database (dumps.wikimedia.org) and search it using whatever tool you prefer. Here are two plug-and-play solutions.
Awk
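Awk is probably the simplest language available, though with a speed trade-off for lack of a real XML parser. Nevertheless, no additional software is required on most systems (awk is a POSIX tool; note that the script below uses GNU awk extensions such as gensub() and the three-argument match(), so awk must be GNU awk).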
- To run: awk -f search-wp.awk > out
#!/bin/awk -f
#
# Search entire Wikipedia database.
# Download: https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
#
# Prints the title of every page whose body matches MySearch.
# Requires GNU awk: relies on a regular-expression RS and on the
# three-argument form of match().
#

# Decode the XML entities handled by this script. "&amp;" is decoded last so
# that text such as "&amp;lt;" becomes "&lt;" rather than "<".
function unescape(s) {
  gsub(/&lt;/, "<", s)
  gsub(/&gt;/, ">", s)
  gsub(/&quot;/, "\"", s)
  gsub(/&amp;/, "\\&", s)    # "\\&" is a literal ampersand in a gsub replacement
  return s
}

BEGIN {

  MySearch = "archive.org/w?e?b?/?[0-9]{1,14}/"
  WPdump = "/f/t/wikipedia-dump/enwiki-20150515-pages-articles.xml"

  # Each record is the text between a "<page" and the following "</page>"
  # (plus the material before the first and after the last page).
  RS = "<page|</page>"

  while ((status = (getline rawstr < WPdump)) > 0) {

    # Skip blank content (the whitespace between two pages)
    if (rawstr ~ /^[[:space:]]*$/)
      continue

    # Reset per-record state so a record lacking <title> or <text> is never
    # searched against, or reported under, the previous page's data.
    title = ""
    body = ""

    # Get article title. Extraction is done on the still-escaped record:
    # there, element content cannot contain a literal "<", so [^<]* stops
    # exactly at the closing tag even if the article text mentions
    # "</title>" or "</text>" in escaped form.
    if (match(rawstr, /<title>([^<]*)<\/title>/, a))
      title = unescape(a[1])

    # Get article body. Any attribute list on <text> is accepted.
    if (match(rawstr, /<text[^>]*>([^<]*)<\/text>/, a))
      body = unescape(a[1])

    # ---------- Search -----

    if (match(body, MySearch, matched_text)) {
      print title
      # print matched_text[0]   # uncomment to print the matched text
    }
  }

  # getline returns -1 on a read error (e.g. the dump file is missing);
  # report it instead of finishing silently with no output.
  if (status < 0) {
    print "search-wp.awk: cannot read " WPdump > "/dev/stderr"
    exit 1
  }

  close(WPdump)
}
Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.
Nim
For a faster solution, here is a Nim example. Nim compiles to optimized C code, which is then compiled with gcc to an executable binary. In a test between Awk and Nim, it took Awk 3m31s to complete a search; the same search in Nim took 0m43s. The code below is pretty much copy-paste, compile and run: just add your Perl-compatible regular expression (or plain text). Example regex strings:
- mySearchRe = re"djvu[.]txt"
- mySearchRe = re"http[:][^ ]*[^ ]"
- (the regex string is wrapped by re"" )
Then download the Nim compiler (the choosenim method is easiest) and compile the source with:
nim c -d:release --opt:speed -d:danger --passC:"-flto" --passL:"-flto" search.nim
#
# Search wikipedia dump for a string and print the article title (or matched text) if located
# Credit: Copyright User:Green_Cardamom, April 2016, MIT License
# Language: Nim
# Additional code credits: Rob Speer (https://github.com/rspeer/wiki2text)
#

import re, options, strutils, os, streams, parsexml

var # configuration variables
  mySearchRe = re"djvu[.]txt"
  wpDump = "/mnt/WindowsFdriveTdir/wikipedia-dump/enwiki-20150901-pages-articles.xml"
  maxCount = 0          # Stop after this many articles have been scanned (for speed testing). Set to 0 to find all.

var
  countAllArticle = 0   # All article count
  countArticle = 0      # Article titles containing a match (any number of matches)
  countHits = 0         # Number of matches of search pattern (running total)

type
  TagType = enum
    TITLE, TEXT, REDIRECT, NS
  ArticleData = array[TagType, string]

#
# Print the three running counters
#
proc printCounts() =
  echo "Articles all: ", countAllArticle
  echo "Articles with a match: ", countArticle
  echo "Number of pattern matches: ", countHits

#
# Search text
#
# Counts the non-overlapping matches of mySearchRe in article[TEXT], updates
# the global counters and echoes article[TITLE] when there is at least one
# match. Returns true when the article matched. When maxCount > 0 and that
# many articles have been scanned, prints the counters and exits the program.
#
proc searchText(article: ArticleData): bool {.discardable.} =

  var
    artcount = 0
    start = 0

  inc countAllArticle

  while start <= article[TEXT].len:
    let (first, last) = findBounds(article[TEXT], mySearchRe, start)
    if first < 0:
      break
    inc artcount
    # Resume after the end of this match so one long match is counted once.
    # For an empty match `last` is `first - 1`; max() still advances by one.
    start = max(first, last) + 1

  if artcount > 0:
    inc countArticle       # number of article titles matching
    countHits += artcount  # number of matches of search pattern
    echo article[TITLE]
    result = true

  if maxCount > 0 and countAllArticle >= maxCount:
    echo ""
    printCounts()
    quit()

# Elements whose character data is collected into the article record.
const RELEVANT_XML_TAGS = ["title", "text", "ns"]

var
  textBuffer = ""
  s = newFileStream(wpDump, fmRead)
  gettingText = false
  gettingAttribute = false
  article: ArticleData
  xml: XmlParser

if s == nil:
  quit("cannot open the file " & wpDump)

for tag in TITLE..NS:
  article[tag] = ""

xml.open(s, wpDump, options={reportWhitespace})

while true:
  # Scan through the XML, handling each token as it arrives.
  xml.next()
  case xml.kind
  of xmlElementStart, xmlElementOpen:
    if RELEVANT_XML_TAGS.contains(xml.elementName):
      # If this is a "title", "text", or "ns" tag, prepare to get its
      # text content. Empty the text buffer so we can overwrite what
      # was there.
      textBuffer.setLen(0)
      gettingText = true
    elif xml.elementName == "page":
      # A new <page> begins: clear every field so that nothing from the
      # previous page (in particular the optional redirect value) can
      # leak into this one.
      for tag in TITLE..NS:
        article[tag].setLen(0)
    elif xml.elementName == "redirect":
      # If this is the start of a redirect tag, prepare to get its
      # attribute value. The buffer still holds whatever the last swap
      # left in it, so it must be emptied first.
      textBuffer.setLen(0)
      gettingAttribute = true

  of xmlAttribute:
    # If we're looking for an attribute value, and we found one, add it
    # to the buffer.
    if gettingAttribute:
      textBuffer.add(xml.attrValue)

  of xmlCharData, xmlWhitespace:
    # If we're looking for text, and we found it, add it to the buffer.
    if gettingText:
      textBuffer.add(xml.charData)

  of xmlElementEnd:
    # When we reach the end of an element we care about, take the text
    # we've found and store it in the 'article' data structure. We can
    # accomplish this quickly by simply swapping their references.
    case xml.elementName
    of "title":
      swap article[TITLE], textBuffer
    of "text":
      swap article[TEXT], textBuffer
    of "redirect":
      swap article[REDIRECT], textBuffer
    of "ns":
      swap article[NS], textBuffer
    of "page":
      # When we reach the end of the <page> tag, send the article
      # data to searchText().
      searchText(article)
    else:
      discard

    # Now that we've reached the end of an element, stop extracting
    # text. (We'll never need to extract text from elements that can
    # have other XML elements nested inside them.)
    gettingText = false
    gettingAttribute = false

  of xmlEof:
    break

  else:
    discard

xml.close

echo "Search Wikipedia completed"
echo "----"
printCounts()
Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.