A small script that blindly reads each line as UTF-8 (or failing that, Windows Codepage 1252) and converts any non-ASCII characters to HTML entities. The input defaults to stdin. The user may specify one or more filenames as command line arguments.
Requires Python 2.3 or better.
#!/usr/bin/env python
"""Convert every non-ASCII character of the input to an HTML entity.

Each input line is decoded as UTF-8 or, failing that, as Windows code page
1252.  Input comes from the files named on the command line, or from stdin
when none are given; the converted text is written to stdout.

Runs on both Python 2 and Python 3.
"""
import fileinput
import re
import sys

try:
    from htmlentitydefs import codepoint2name  # Python 2
except ImportError:
    from html.entities import codepoint2name  # Python 3

# Matches one non-ASCII character.  The first alternative catches a UTF-16
# surrogate pair, which is how narrow Python 2 builds store characters above
# U+FFFF, so that such a character is reported as a single match.
# Deliberately not a raw string: the \uXXXX escapes must be expanded by the
# string literal, because the Python 2 regex engine does not understand them.
reNonASCII = re.compile(u'[\ud800-\udbff][\udc00-\udfff]|[^\x00-\x7f]')


def replaceNonASCII(match):
    '''Replace a unicode character with a named XHTML entity if possible,
    and a decimal entity otherwise.

    Note, we do not concern ourselves with escaping 'unsafe' characters such
    as &, we assume the input text is already properly escaped.

    match -- a match object produced by reNonASCII.
    Returns the entity text, e.g. '&eacute;' or '&#9731;'.
    '''
    char = match.group()
    if len(char) == 2:
        # Surrogate pair: recombine the halves into the real code point so
        # one valid entity is emitted instead of two invalid ones.
        c = 0x10000 + ((ord(char[0]) - 0xD800) << 10) + (ord(char[1]) - 0xDC00)
    else:
        c = ord(char)
    name = codepoint2name.get(c)
    if name is None:
        return '&#%d;' % c
    return '&%s;' % name


def decodeLine(raw):
    '''Decode one line of bytes to unicode text.

    UTF-8 is tried first; if the line is not valid UTF-8 it is decoded as
    Windows-1252 instead.  The few bytes the Windows-1252 codec leaves
    undefined become U+FFFD rather than raising, so one stray byte cannot
    abort the whole run.
    '''
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('windows-1252', 'replace')


def main():
    '''Convert the files named on the command line (default: stdin) and
    write the result to stdout.'''
    if sys.version_info[0] >= 3:
        # Python 3 would otherwise decode the input with the locale's
        # encoding; request raw bytes so decodeLine() makes the choice, and
        # write bytes back so line endings pass through untouched.
        lines = fileinput.input(mode='rb')
        out = sys.stdout.buffer
    else:
        lines = fileinput.input()
        out = sys.stdout
    for raw in lines:
        text = reNonASCII.sub(replaceNonASCII, decodeLine(raw))
        # Everything left is ASCII, so this encode cannot fail.  write() is
        # used instead of "print x," to avoid the stray space print inserts
        # after a line that lacks a trailing newline.
        out.write(text.encode('ascii'))


if __name__ == '__main__':
    main()