diff --git a/Utils/fix-htmldoc-utf8.py b/Utils/fix-htmldoc-utf8.py new file mode 100755 index 0000000..4e26604 --- /dev/null +++ b/Utils/fix-htmldoc-utf8.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python2 +# coding: utf-8 +# +# Name : fix-htmldoc-utf8 +# Summary: Program to fix UTF-8 characters that HTMLDOC has messed up +# Author : Aurelio Jargas www.aurelio.net/soft +# License: BSD +# Release: April, 2008 +# +# HTMLDOC has no Unicode support, so when you try to use it on a UTF-8 file, +# all the special characters (not ASCII) will be incorrect in the resulting HTML. +# This program fixes this, restoring the original UTF-8 characters. +# +# Just use it as a filter (reads STDIN, writes results to STDOUT) or use the -w option +# to fix the file in place. +# +# Examples: +# cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html +# fix-htmldoc-utf8 myfile.html > myfile-ok.html +# fix-htmldoc-utf8 -w myfile.html +# + +import sys + +# You can add new chars to this mapping, if needed. +# The first set are the ISO-8859-1 extended chars. +# The second set are the Unicode chars I've found on my keyboard. 
+# +mapping = """ +¡ ¡ +¢ ¢ +£ £ +¤ ¤ +¥ Â¥ +¦ ¦ +§ § +¨ ¨ +© © +ª ª +« « +¬ ¬ +® ® +¯ ¯ +° ° +± ± +² ² +³ ³ +´ ´ +µ Âμ +¶ ¶ +· · +¸ ¸ +¹ ¹ +º º +» » +¼ ¼ +½ ½ +¾ ¾ +¿ ¿ +À Ã\x80 +Á Ã\x81 + Ã\x82 +à Ã\x83 +Ä Ã\x84 +Å Ã\x85 +Æ Ã\x86 +Ç Ã\x87 +È Ã\x88 +É Ã\x89 +Ê Ã\x8a +Ë Ã\x8b +Ì Ã\x8c +Í Ã\x8d +Î Ã\x8e +Ï Ã\x8f +Ð Ã\x90 +Ñ Ã\x91 +Ò Ã\x92 +Ó Ã\x93 +Ô Ã\x94 +Õ Ã\x95 +Ö Ã\x96 +× Ã\x97 +Ø Ã\x98 +Ù Ã\x99 +Ú Ã\x9a +Û Ã\x9b +Ü Ã\x9c +Ý Ã\x9d +Þ Ã\x9e +ß Ã\x9f +à à +á á +â â +ã ã +ä ä +å Ã¥ +æ æ +ç ç +è è +é é +ê ê +ë ë +ì ì +í í +î î +ï ï +ð ð +ñ ñ +ò ò +ó ó +ô ô +õ Ãμ +ö ö +÷ ÷ +ø ø +ù ù +ú ú +û û +ü ü +ý ý +þ þ +ÿ ÿ + +  +™ â\x84¢ +€ â\x82¬ +æ æ +Œ Å\x92 +≤ â\x89¤ +≠ â\x89  +≥ â\x89¥ +fi ï¬\x81 +fl ï¬\x82 +∞ â\x88\x9e +• â\x80¢ +⁄ â\x81\x84 +≈ â\x89\x88 +◊ â\x97\x8a +∑ â\x88\x91 +∏ â\x88\x8f +π Ï\x80 +∂ â\x88\x82 +∆ â\x88\x86 +ƒ Æ\x92 +Ω Î© +√ â\x88\x9a +∫ â\x88« +† â\x80  +‡ â\x80¡ +ı ı +› â\x80º +˚ Ë\x9a +˙ Ë\x99 +ˇ Ë\x87 +˝ Ë\x9d +˛ Ë\x9b +‘ â\x80\x98 +’ â\x80\x99 +‚ â\x80\x9a +“ â\x80\x9c +” â\x80\x9d +„ â\x80\x9e +… â\x80¦ +— â\x80\x94 +– â\x80\x93 + +CHARSET=utf-8 CHARSET=iso-8859-1 +CHARSET=utf-8 CHARSET=iso-iso-8859-1 +""" + +# Just a standard search & replace +def fixit(text): + for pair in mapping.split('\n'): + if not pair: continue + repl, patt = pair.split('\t') + text = text.replace(patt.strip(), repl.strip()) + return text + +# User wants to save the file in place or not? 
+write_file = False +if len(sys.argv) > 1 and sys.argv[1] == '-w': + write_file = True + sys.argv.pop(1) + +# The input files (if any) +files = sys.argv[1:] + +if files: + # Fix input files one by one + for this_file in files: + try: + # Read and fix + f = open(this_file, 'r') + fixed = fixit(f.read()) + f.close() + + # Save the file or show on STDOUT + if write_file: + f = open(this_file, 'w') + f.write(fixed) + f.close() + print "Fixed", this_file + else: + print fixed, + except: + print "Error fixing", this_file + sys.exit(1) +else: + # No input file, read from STDIN and send results to STDOUT + print fixit(sys.stdin.read()),