#!/usr/bin/env python2 # coding: utf-8 # # Name : fix-htmldoc-utf8 # Summary: Program to fix UTF-8 characters that HTMLDOC has messed # Author : Aurelio Jargas www.aurelio.net/soft # License: BSD # Release: April, 2008 # # HTMLDOC has no Unicode support, so when you try to use it in a UTF-8 file, # all the special characters (not ASCII) will be incorrect in the resulting HTML. # This program fixes this, restoring the original UTF-8 characters. # # Just use it as a filter (reads STDIN, results to STDOUT) or use the -w option # fix the file in place. # # Examples: # cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html # fix-htmldoc-utf8 myfile.html > myfile-ok.html # fix-htmldoc-utf8 -w myfile.html # import sys # You can add new chars to this mapping, if needed. # The first set are the ISO-8859-1 extended chars. # The second set are the Unicode chars I've found on my keyboard. # mapping = """ ¡ ¡ ¢ ¢ £ £ ¤ ¤ ¥ Â¥ ¦ ¦ § § ¨ ¨ © © ª ª « « ¬ ¬ ® ® ¯ ¯ ° ° ± ± ² ² ³ ³ ´ ´ µ Âμ ¶ ¶ · · ¸ ¸ ¹ ¹ º º » » ¼ ¼ ½ ½ ¾ ¾ ¿ ¿ À Ã\x80 Á Ã\x81  Ã\x82 à Ã\x83 Ä Ã\x84 Å Ã\x85 Æ Ã\x86 Ç Ã\x87 È Ã\x88 É Ã\x89 Ê Ã\x8a Ë Ã\x8b Ì Ã\x8c Í Ã\x8d Î Ã\x8e Ï Ã\x8f Ð Ã\x90 Ñ Ã\x91 Ò Ã\x92 Ó Ã\x93 Ô Ã\x94 Õ Ã\x95 Ö Ã\x96 × Ã\x97 Ø Ã\x98 Ù Ã\x99 Ú Ã\x9a Û Ã\x9b Ü Ã\x9c Ý Ã\x9d Þ Ã\x9e ß Ã\x9f à à á á â â ã ã ä ä å Ã¥ æ æ ç ç è è é é ê ê ë ë ì ì í í î î ï ï ð ð ñ ñ ò ò ó ó ô ô õ Ãμ ö ö ÷ ÷ ø ø ù ù ú ú û û ü ü ý ý þ þ ÿ ÿ   ™ â\x84¢ € â\x82¬ æ æ Œ Å\x92 ≤ â\x89¤ ≠ â\x89  ≥ â\x89¥ fi ï¬\x81 fl ï¬\x82 ∞ â\x88\x9e • â\x80¢ ⁄ â\x81\x84 ≈ â\x89\x88 ◊ â\x97\x8a ∑ â\x88\x91 ∏ â\x88\x8f π Ï\x80 ∂ â\x88\x82 ∆ â\x88\x86 ƒ Æ\x92 Ω Î© √ â\x88\x9a ∫ â\x88« † â\x80  ‡ â\x80¡ ı ı › â\x80º ˚ Ë\x9a ˙ Ë\x99 ˇ Ë\x87 ˝ Ë\x9d ˛ Ë\x9b ‘ â\x80\x98 ’ â\x80\x99 ‚ â\x80\x9a “ â\x80\x9c ” â\x80\x9d „ â\x80\x9e … â\x80¦ — â\x80\x94 – â\x80\x93 CHARSET=utf-8 CHARSET=iso-8859-1 CHARSET=utf-8 CHARSET=iso-iso-8859-1 """ # Just a standard search & replace def fixit(text): for pair in mapping.split('\n'): if not pair: continue repl, patt = pair.split('\t') text = text.replace(patt.strip(), repl.strip()) return text # User wants to save the file in place or not? write_file = False if len(sys.argv) > 1 and sys.argv[1] == '-w': write_file = True sys.argv.pop(1) # The input files (if any) files = sys.argv[1:] if files: # Fix input files one by one for this_file in files: try: # Read and fix f = open(this_file, 'r') fixed = fixit(f.read()) f.close() # Save the file or show on STDOUT if write_file: f = open(this_file, 'w') f.write(fixed) f.close() print "Fixed", this_file else: print fixed, except: print "Error fixing", this_file sys.exit(1) else: # No input file, read from STDIN and send results to STDOUT print fixit(sys.stdin.read()),