212 lines
4.2 KiB
Text
212 lines
4.2 KiB
Text
|
#!/usr/bin/env python2
|
|||
|
# coding: utf-8
|
|||
|
#
|
|||
|
# Name : fix-htmldoc-utf8
|
|||
|
# Summary: Program to fix UTF-8 characters that HTMLDOC has messed up
|
|||
|
# Author : Aurelio Jargas www.aurelio.net/soft
|
|||
|
# License: BSD
|
|||
|
# Release: April, 2008
|
|||
|
#
|
|||
|
# HTMLDOC has no Unicode support, so when you try to use it in a UTF-8 file,
|
|||
|
# all the special (non-ASCII) characters will be incorrect in the resulting HTML.
|
|||
|
# This program fixes this, restoring the original UTF-8 characters.
|
|||
|
#
|
|||
|
# Just use it as a filter (reads STDIN, results to STDOUT) or use the -w option
|
|||
|
# to fix the file in place.
|
|||
|
#
|
|||
|
# Examples:
|
|||
|
# cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html
|
|||
|
# fix-htmldoc-utf8 myfile.html > myfile-ok.html
|
|||
|
# fix-htmldoc-utf8 -w myfile.html
|
|||
|
#
|
|||
|
|
|||
|
import sys
|
|||
|
|
|||
|
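# You can add new chars to this mapping, if needed. Each line holds the correct
# character, a TAB, and then the garbled sequence found in HTMLDOC's output.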
|
|||
|
# The first set are the ISO-8859-1 extended chars.
|
|||
|
# The second set are the Unicode chars I've found on my keyboard.
|
|||
|
#
|
|||
|
mapping = """
|
|||
|
¡ ¡
|
|||
|
¢ ¢
|
|||
|
£ £
|
|||
|
¤ ¤
|
|||
|
¥ Â¥
|
|||
|
¦ ¦
|
|||
|
§ §
|
|||
|
¨ ¨
|
|||
|
© ©
|
|||
|
ª ª
|
|||
|
« «
|
|||
|
¬ ¬
|
|||
|
® ®
|
|||
|
¯ ¯
|
|||
|
° °
|
|||
|
± ±
|
|||
|
² ²
|
|||
|
³ ³
|
|||
|
´ ´
|
|||
|
µ Âμ
|
|||
|
¶ ¶
|
|||
|
· ·
|
|||
|
¸ ¸
|
|||
|
¹ ¹
|
|||
|
º º
|
|||
|
» »
|
|||
|
¼ ¼
|
|||
|
½ ½
|
|||
|
¾ ¾
|
|||
|
¿ ¿
|
|||
|
À Ã\x80
|
|||
|
Á Ã\x81
|
|||
|
 Ã\x82
|
|||
|
à Ã\x83
|
|||
|
Ä Ã\x84
|
|||
|
Å Ã\x85
|
|||
|
Æ Ã\x86
|
|||
|
Ç Ã\x87
|
|||
|
È Ã\x88
|
|||
|
É Ã\x89
|
|||
|
Ê Ã\x8a
|
|||
|
Ë Ã\x8b
|
|||
|
Ì Ã\x8c
|
|||
|
Í Ã\x8d
|
|||
|
Î Ã\x8e
|
|||
|
Ï Ã\x8f
|
|||
|
Ð Ã\x90
|
|||
|
Ñ Ã\x91
|
|||
|
Ò Ã\x92
|
|||
|
Ó Ã\x93
|
|||
|
Ô Ã\x94
|
|||
|
Õ Ã\x95
|
|||
|
Ö Ã\x96
|
|||
|
× Ã\x97
|
|||
|
Ø Ã\x98
|
|||
|
Ù Ã\x99
|
|||
|
Ú Ã\x9a
|
|||
|
Û Ã\x9b
|
|||
|
Ü Ã\x9c
|
|||
|
Ý Ã\x9d
|
|||
|
Þ Ã\x9e
|
|||
|
ß Ã\x9f
|
|||
|
à Ã
|
|||
|
á á
|
|||
|
â â
|
|||
|
ã ã
|
|||
|
ä ä
|
|||
|
å Ã¥
|
|||
|
æ æ
|
|||
|
ç ç
|
|||
|
è è
|
|||
|
é é
|
|||
|
ê ê
|
|||
|
ë ë
|
|||
|
ì ì
|
|||
|
í í
|
|||
|
î î
|
|||
|
ï ï
|
|||
|
ð ð
|
|||
|
ñ ñ
|
|||
|
ò ò
|
|||
|
ó ó
|
|||
|
ô ô
|
|||
|
õ Ãμ
|
|||
|
ö ö
|
|||
|
÷ ÷
|
|||
|
ø ø
|
|||
|
ù ù
|
|||
|
ú ú
|
|||
|
û û
|
|||
|
ü ü
|
|||
|
ý ý
|
|||
|
þ þ
|
|||
|
ÿ ÿ
|
|||
|
|
|||
|

|
|||
|
™ â\x84¢
|
|||
|
€ â\x82¬
|
|||
|
æ æ
|
|||
|
Œ Å\x92
|
|||
|
≤ â\x89¤
|
|||
|
≠ â\x89
|
|||
|
≥ â\x89¥
|
|||
|
fi ï¬\x81
|
|||
|
fl ï¬\x82
|
|||
|
∞ â\x88\x9e
|
|||
|
• â\x80¢
|
|||
|
⁄ â\x81\x84
|
|||
|
≈ â\x89\x88
|
|||
|
◊ â\x97\x8a
|
|||
|
∑ â\x88\x91
|
|||
|
∏ â\x88\x8f
|
|||
|
π Ï\x80
|
|||
|
∂ â\x88\x82
|
|||
|
∆ â\x88\x86
|
|||
|
ƒ Æ\x92
|
|||
|
Ω Î©
|
|||
|
√ â\x88\x9a
|
|||
|
∫ â\x88«
|
|||
|
† â\x80
|
|||
|
‡ â\x80¡
|
|||
|
ı ı
|
|||
|
› â\x80º
|
|||
|
˚ Ë\x9a
|
|||
|
˙ Ë\x99
|
|||
|
ˇ Ë\x87
|
|||
|
˝ Ë\x9d
|
|||
|
˛ Ë\x9b
|
|||
|
‘ â\x80\x98
|
|||
|
’ â\x80\x99
|
|||
|
‚ â\x80\x9a
|
|||
|
“ â\x80\x9c
|
|||
|
” â\x80\x9d
|
|||
|
„ â\x80\x9e
|
|||
|
… â\x80¦
|
|||
|
— â\x80\x94
|
|||
|
– â\x80\x93
|
|||
|
|
|||
|
CHARSET=utf-8 CHARSET=iso-8859-1
|
|||
|
CHARSET=utf-8 CHARSET=iso-iso-8859-1
|
|||
|
"""
|
|||
|
|
|||
|
# Just a standard search & replace
|
|||
|
def fixit(text):
    """Return *text* with HTMLDOC-garbled sequences restored.

    Each usable line of the module-level ``mapping`` table holds the
    replacement text, a TAB, and the garbled pattern to search for.
    The pairs are applied in table order as plain (non-regex) substring
    replacements, so a later entry operates on the output of the
    earlier ones.

    Lines without a TAB separator (blank, whitespace-only or otherwise
    malformed) are skipped instead of aborting the whole run, and so
    are entries whose pattern is empty after stripping.
    """
    for pair in mapping.split('\n'):
        # No separator: nothing to unpack, so ignore the line
        if '\t' not in pair:
            continue

        # Split on the first TAB only; a stray trailing TAB is stripped below
        repl, _, patt = pair.partition('\t')
        repl = repl.strip()
        patt = patt.strip()

        # replace('', x) would insert x between every character of text
        if not patt:
            continue

        text = text.replace(patt, repl)
    return text
|
|||
|
|
|||
|
# User wants to save the file in place or not?
# A leading -w option means "rewrite every input file in place".
write_file = False
if len(sys.argv) > 1 and sys.argv[1] == '-w':
    write_file = True
    sys.argv.pop(1)

# The input files (if any)
files = sys.argv[1:]

if files:
    # Fix input files one by one
    for this_file in files:
        try:
            # Read and fix; "with" closes the handle even when read() fails
            with open(this_file, 'r') as f:
                fixed = fixit(f.read())

            # The file is reopened for writing only after it was fully read
            if write_file:
                with open(this_file, 'w') as f:
                    f.write(fixed)
        except (IOError, OSError) as error:
            # Report on STDERR so the message never ends up inside the
            # filtered output, and keep the original exit status of 1
            sys.stderr.write("Error fixing %s: %s\n" % (this_file, error))
            sys.exit(1)

        # Save confirmation or the fixed contents on STDOUT.
        # write() emits the text exactly, with no extra space or newline.
        if write_file:
            sys.stdout.write("Fixed %s\n" % this_file)
        else:
            sys.stdout.write(fixed)
else:
    # No input file, read from STDIN and send results to STDOUT
    sys.stdout.write(fixit(sys.stdin.read()))
|