211 lines
4.2 KiB
Python
Executable file
211 lines
4.2 KiB
Python
Executable file
#!/usr/bin/env python2
|
||
# coding: utf-8
|
||
#
|
||
# Name : fix-htmldoc-utf8
|
||
# Summary: Program to fix UTF-8 characters that HTMLDOC has messed up
|
||
# Author : Aurelio Jargas www.aurelio.net/soft
|
||
# License: BSD
|
||
# Release: April, 2008
|
||
#
|
||
# HTMLDOC has no Unicode support, so when you try to use it in a UTF-8 file,
|
||
# all the special characters (not ASCII) will be incorrect in the resulting HTML.
|
||
# This program fixes this, restoring the original UTF-8 characters.
|
||
#
|
||
# Just use it as a filter (reads STDIN, results to STDOUT) or use the -w option
# to fix the file in place.
|
||
#
|
||
# Examples:
|
||
# cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html
|
||
# fix-htmldoc-utf8 myfile.html > myfile-ok.html
|
||
# fix-htmldoc-utf8 -w myfile.html
|
||
#
|
||
|
||
import sys
|
||
|
||
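# You can add new chars to this mapping, if needed.
# Each line holds the correct character, a TAB, and then the garbled
# sequence that shows up in the HTMLDOC output for that character.
# The first set is the ISO-8859-1 extended chars.
# The second set is the Unicode chars I've found on my keyboard.
#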
|
||
mapping = """
|
||
¡ ¡
|
||
¢ ¢
|
||
£ £
|
||
¤ ¤
|
||
¥ Â¥
|
||
¦ ¦
|
||
§ §
|
||
¨ ¨
|
||
© ©
|
||
ª ª
|
||
« «
|
||
¬ ¬
|
||
® ®
|
||
¯ ¯
|
||
° °
|
||
± ±
|
||
² ²
|
||
³ ³
|
||
´ ´
|
||
µ Âμ
|
||
¶ ¶
|
||
· ·
|
||
¸ ¸
|
||
¹ ¹
|
||
º º
|
||
» »
|
||
¼ ¼
|
||
½ ½
|
||
¾ ¾
|
||
¿ ¿
|
||
À Ã\x80
|
||
Á Ã\x81
|
||
 Ã\x82
|
||
à Ã\x83
|
||
Ä Ã\x84
|
||
Å Ã\x85
|
||
Æ Ã\x86
|
||
Ç Ã\x87
|
||
È Ã\x88
|
||
É Ã\x89
|
||
Ê Ã\x8a
|
||
Ë Ã\x8b
|
||
Ì Ã\x8c
|
||
Í Ã\x8d
|
||
Î Ã\x8e
|
||
Ï Ã\x8f
|
||
Ð Ã\x90
|
||
Ñ Ã\x91
|
||
Ò Ã\x92
|
||
Ó Ã\x93
|
||
Ô Ã\x94
|
||
Õ Ã\x95
|
||
Ö Ã\x96
|
||
× Ã\x97
|
||
Ø Ã\x98
|
||
Ù Ã\x99
|
||
Ú Ã\x9a
|
||
Û Ã\x9b
|
||
Ü Ã\x9c
|
||
Ý Ã\x9d
|
||
Þ Ã\x9e
|
||
ß Ã\x9f
|
||
à Ã
|
||
á á
|
||
â â
|
||
ã ã
|
||
ä ä
|
||
å Ã¥
|
||
æ æ
|
||
ç ç
|
||
è è
|
||
é é
|
||
ê ê
|
||
ë ë
|
||
ì ì
|
||
í í
|
||
î î
|
||
ï ï
|
||
ð ð
|
||
ñ ñ
|
||
ò ò
|
||
ó ó
|
||
ô ô
|
||
õ Ãμ
|
||
ö ö
|
||
÷ ÷
|
||
ø ø
|
||
ù ù
|
||
ú ú
|
||
û û
|
||
ü ü
|
||
ý ý
|
||
þ þ
|
||
ÿ ÿ
|
||
|
||

|
||
™ â\x84¢
|
||
€ â\x82¬
|
||
æ æ
|
||
Œ Å\x92
|
||
≤ â\x89¤
|
||
≠ â\x89
|
||
≥ â\x89¥
|
||
fi ï¬\x81
|
||
fl ï¬\x82
|
||
∞ â\x88\x9e
|
||
• â\x80¢
|
||
⁄ â\x81\x84
|
||
≈ â\x89\x88
|
||
◊ â\x97\x8a
|
||
∑ â\x88\x91
|
||
∏ â\x88\x8f
|
||
π Ï\x80
|
||
∂ â\x88\x82
|
||
∆ â\x88\x86
|
||
ƒ Æ\x92
|
||
Ω Î©
|
||
√ â\x88\x9a
|
||
∫ â\x88«
|
||
† â\x80
|
||
‡ â\x80¡
|
||
ı ı
|
||
› â\x80º
|
||
˚ Ë\x9a
|
||
˙ Ë\x99
|
||
ˇ Ë\x87
|
||
˝ Ë\x9d
|
||
˛ Ë\x9b
|
||
‘ â\x80\x98
|
||
’ â\x80\x99
|
||
‚ â\x80\x9a
|
||
“ â\x80\x9c
|
||
” â\x80\x9d
|
||
„ â\x80\x9e
|
||
… â\x80¦
|
||
— â\x80\x94
|
||
– â\x80\x93
|
||
|
||
CHARSET=utf-8 CHARSET=iso-8859-1
|
||
CHARSET=utf-8 CHARSET=iso-iso-8859-1
|
||
"""
|
||
|
||
# Just a standard search & replace
|
||
def fixit(text, table=None):
    """Return *text* with every garbled sequence replaced by the right char.

    The replacement table is a string with one entry per line, written as
    ``<correct text><TAB><garbled text>``.  Entries are applied in table
    order as a plain (non-regex) search & replace, so a later entry sees
    the result of the earlier ones.

    text  -- the string to fix.
    table -- optional replacement table in the format above; when omitted,
             the module-level ``mapping`` is used.

    Raises ValueError when a non-blank table line does not consist of
    exactly two TAB-separated fields, or when its garbled (search) field
    is empty.
    """
    if table is None:
        table = mapping

    # Trim plain ASCII whitespace only.  A bare strip() on a unicode string
    # would also eat a trailing U+0085 or U+00A0, and several garbled
    # sequences legitimately end with one of those characters.
    padding = ' \t\r\n'

    for pair in table.split('\n'):
        # Blank or whitespace-only lines merely separate groups of entries.
        if not pair.strip(padding):
            continue

        fields = pair.split('\t')
        if len(fields) != 2:
            raise ValueError(
                'Bad mapping line, expected "char<TAB>garbled": %r' % pair)

        repl, patt = [field.strip(padding) for field in fields]
        if not patt:
            # replace('', repl) would insert repl between every character.
            raise ValueError(
                'Bad mapping line, nothing to search for: %r' % pair)

        text = text.replace(patt, repl)

    return text
|
||
|
||
# Command line driver.
#
# With file arguments each file is fixed in turn: the result is written
# back to the same file when -w is the first argument, otherwise it goes
# to STDOUT.  Without file arguments the script works as a filter from
# STDIN to STDOUT.  The exit status is 1 as soon as one file fails.

# User wants to save the file in place or not?
write_file = False
if len(sys.argv) > 1 and sys.argv[1] == '-w':
    write_file = True
    sys.argv.pop(1)

# The input files (if any)
files = sys.argv[1:]

if files:
    # Fix input files one by one
    for this_file in files:
        try:
            # Read and fix; 'with' closes the file even if reading fails
            with open(this_file, 'r') as f:
                fixed = fixit(f.read())

            # Save the file or show on STDOUT
            if write_file:
                with open(this_file, 'w') as f:
                    f.write(fixed)
                sys.stdout.write("Fixed %s\n" % this_file)
            else:
                # write() emits the text untouched; the print statement
                # would add a space between files or a final newline.
                sys.stdout.write(fixed)
        except (IOError, OSError):
            # Only I/O problems are reported here; Ctrl-C and programming
            # errors are left to propagate.  The message goes to STDERR so
            # it never ends up inside the filtered output.
            sys.stderr.write("Error fixing %s\n" % this_file)
            sys.exit(1)
else:
    # No input file, read from STDIN and send results to STDOUT
    sys.stdout.write(fixit(sys.stdin.read()))
|