fix-htmldoc-utf8

This commit is contained in:
Frank Villaro-Dixon 2014-09-16 18:28:22 +02:00
parent f596ed889e
commit 8e94c040c1

211
Utils/fix-htmldoc-utf8.py Executable file
View file

@ -0,0 +1,211 @@
#!/usr/bin/env python2
# coding: utf-8
#
# Name : fix-htmldoc-utf8
# Summary: Program to fix UTF-8 characters that HTMLDOC has messed up
# Author : Aurelio Jargas www.aurelio.net/soft
# License: BSD
# Release: April, 2008
#
# HTMLDOC has no Unicode support, so when you try to use it in a UTF-8 file,
# all the special characters (not ASCII) will be incorrect in the resulting HTML.
# This program fixes this, restoring the original UTF-8 characters.
#
# Just use it as a filter (reads STDIN, results to STDOUT) or use the -w option
# to fix the file in place.
#
# Examples:
# cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html
# fix-htmldoc-utf8 myfile.html > myfile-ok.html
# fix-htmldoc-utf8 -w myfile.html
#
import sys
# You can add new chars to this mapping, if needed.
# Each line holds the correct text, a TAB, then the garbled text to search for.
# The first set are the ISO-8859-1 extended chars.
# The second set are the Unicode chars I've found on my keyboard.
#
mapping = """
¡ ¡
¢ ¢
£ £
¤ ¤
¥ Â¥
¦ ¦
§ §
¨ ¨
© ©
ª ª
« «
¬ ¬
® ®
¯ ¯
° °
± ±
² ²
³ ³
´ ´
µ Âμ
¶
· ·
¸ ¸
¹ ¹
º º
» »
¼ ¼
½ ½
¾ ¾
¿ ¿
À Ã\x80
Á Ã\x81
 Ã\x82
à Ã\x83
Ä Ã\x84
Å Ã\x85
Æ Ã\x86
Ç Ã\x87
È Ã\x88
É Ã\x89
Ê Ã\x8a
Ë Ã\x8b
Ì Ã\x8c
Í Ã\x8d
Î Ã\x8e
Ï Ã\x8f
Ð Ã\x90
Ñ Ã\x91
Ò Ã\x92
Ó Ã\x93
Ô Ã\x94
Õ Ã\x95
Ö Ã\x96
× Ã\x97
Ø Ã\x98
Ù Ã\x99
Ú Ã\x9a
Û Ã\x9b
Ü Ã\x9c
Ý Ã\x9d
Þ Ã\x9e
ß Ã\x9f
à à
á á
â â
ã ã
ä ä
å Ã¥
æ æ
ç ç
è è
é é
ê ê
ë ë
ì ì
í í
î î
ï ï
ð ð
ñ ñ
ò ò
ó ó
ô ô
õ Ãμ
ö ö
÷ ÷
ø ø
ù ù
ú ú
û û
ü ü
ý ý
þ þ
ÿ ÿ

â\x84¢
â\x82¬
æ æ
Œ Å\x92
â\x89¤
â\x89 
â\x89¥
ï¬\x81
ï¬\x82
â\x88\x9e
â\x80¢
â\x81\x84
â\x89\x88
â\x97\x8a
â\x88\x91
â\x88\x8f
π Ï\x80
â\x88\x82
â\x88\x86
ƒ Æ\x92
Ω Î©
â\x88\x9a
â\x88«
â\x80 
â\x80¡
ı ı
â\x80º
˚ Ë\x9a
˙ Ë\x99
ˇ Ë\x87
˝ Ë\x9d
˛ Ë\x9b
â\x80\x98
â\x80\x99
â\x80\x9a
â\x80\x9c
â\x80\x9d
â\x80\x9e
â\x80¦
â\x80\x94
â\x80\x93
CHARSET=utf-8 CHARSET=iso-8859-1
CHARSET=utf-8 CHARSET=iso-iso-8859-1
"""
# Just a standard search & replace
def fixit(text, table=None):
    """Return ``text`` with every garbled sequence replaced by its original.

    Args:
        text: The string to repair.
        table: Replacement table, one entry per line, each written as the
            replacement, a TAB, and the pattern to search for.  Defaults to
            the module-level ``mapping``.

    Returns:
        The repaired string.  Entries are applied in table order.

    Lines that do not consist of exactly two TAB-separated fields, and
    entries whose pattern is empty, are ignored instead of raising
    ``ValueError`` or corrupting the text.
    """
    if table is None:
        table = mapping
    # Only strip plain padding.  A bare strip() would also remove a trailing
    # U+00A0 on unicode strings, truncating patterns that end in that char.
    padding = ' \t\r'
    for pair in table.split('\n'):
        fields = pair.split('\t')
        if len(fields) != 2:
            continue
        repl, patt = fields[0].strip(padding), fields[1].strip(padding)
        # str.replace('', x) would insert x between every character.
        if not patt:
            continue
        text = text.replace(patt, repl)
    return text
# Command-line driver: fix each file named on the command line (in place
# when -w is given, otherwise to STDOUT), or filter STDIN to STDOUT when no
# file is named.

# User wants to save the file in place or not?
write_file = False
if len(sys.argv) > 1 and sys.argv[1] == '-w':
    write_file = True
    sys.argv.pop(1)

# The input files (if any)
files = sys.argv[1:]

if files:
    # Fix input files one by one
    for this_file in files:
        try:
            # Read and fix; the handle is closed even if reading fails.
            with open(this_file, 'r') as f:
                fixed = fixit(f.read())

            # Save the file or show on STDOUT
            if write_file:
                with open(this_file, 'w') as f:
                    f.write(fixed)
                sys.stdout.write("Fixed %s\n" % this_file)
            else:
                # write() emits the text exactly, with no separator space
                # or trailing newline added between files.
                sys.stdout.write(fixed)
        except (IOError, OSError) as err:
            # Report on STDERR so the message never mixes with the fixed
            # output, and stop at the first failing file.
            sys.stderr.write("Error fixing %s: %s\n" % (this_file, err))
            sys.exit(1)
else:
    # No input file, read from STDIN and send results to STDOUT
    sys.stdout.write(fixit(sys.stdin.read()))