fix-htmldoc-utf8
This commit is contained in:
parent
f596ed889e
commit
8e94c040c1
1 changed files with 211 additions and 0 deletions
211
Utils/fix-htmldoc-utf8.py
Executable file
211
Utils/fix-htmldoc-utf8.py
Executable file
|
@ -0,0 +1,211 @@
|
|||
#!/usr/bin/env python2
|
||||
# coding: utf-8
|
||||
#
|
||||
# Name : fix-htmldoc-utf8
|
||||
# Summary: Program to fix UTF-8 characters that HTMLDOC has messed up
|
||||
# Author : Aurelio Jargas www.aurelio.net/soft
|
||||
# License: BSD
|
||||
# Release: April, 2008
|
||||
#
|
||||
# HTMLDOC has no Unicode support, so when you try to use it in a UTF-8 file,
|
||||
# all the special characters (not ASCII) will be incorrect in the resulting HTML.
|
||||
# This program fixes this, restoring the original UTF-8 characters.
|
||||
#
|
||||
# Just use it as a filter (reads STDIN, results to STDOUT) or use the -w option
|
||||
# to fix the file in place.
|
||||
#
|
||||
# Examples:
|
||||
# cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html
|
||||
# fix-htmldoc-utf8 myfile.html > myfile-ok.html
|
||||
# fix-htmldoc-utf8 -w myfile.html
|
||||
#
|
||||
|
||||
import sys
|
||||
|
||||
# You can add new chars to this mapping, if needed.
|
||||
# The first set are the ISO-8859-1 extended chars.
|
||||
# The second set are the Unicode chars I've found on my keyboard.
|
||||
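# Format: one pair per line -- the correct character, a single TAB, then
# the garbled sequence that must be replaced by it.
#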
|
||||
mapping = """
|
||||
¡ ¡
|
||||
¢ ¢
|
||||
£ £
|
||||
¤ ¤
|
||||
¥ Â¥
|
||||
¦ ¦
|
||||
§ §
|
||||
¨ ¨
|
||||
© ©
|
||||
ª ª
|
||||
« «
|
||||
¬ ¬
|
||||
® ®
|
||||
¯ ¯
|
||||
° °
|
||||
± ±
|
||||
² ²
|
||||
³ ³
|
||||
´ ´
|
||||
µ Âμ
|
||||
¶ ¶
|
||||
· ·
|
||||
¸ ¸
|
||||
¹ ¹
|
||||
º º
|
||||
» »
|
||||
¼ ¼
|
||||
½ ½
|
||||
¾ ¾
|
||||
¿ ¿
|
||||
À Ã\x80
|
||||
Á Ã\x81
|
||||
 Ã\x82
|
||||
à Ã\x83
|
||||
Ä Ã\x84
|
||||
Å Ã\x85
|
||||
Æ Ã\x86
|
||||
Ç Ã\x87
|
||||
È Ã\x88
|
||||
É Ã\x89
|
||||
Ê Ã\x8a
|
||||
Ë Ã\x8b
|
||||
Ì Ã\x8c
|
||||
Í Ã\x8d
|
||||
Î Ã\x8e
|
||||
Ï Ã\x8f
|
||||
Ð Ã\x90
|
||||
Ñ Ã\x91
|
||||
Ò Ã\x92
|
||||
Ó Ã\x93
|
||||
Ô Ã\x94
|
||||
Õ Ã\x95
|
||||
Ö Ã\x96
|
||||
× Ã\x97
|
||||
Ø Ã\x98
|
||||
Ù Ã\x99
|
||||
Ú Ã\x9a
|
||||
Û Ã\x9b
|
||||
Ü Ã\x9c
|
||||
Ý Ã\x9d
|
||||
Þ Ã\x9e
|
||||
ß Ã\x9f
|
||||
à Ã
|
||||
á á
|
||||
â â
|
||||
ã ã
|
||||
ä ä
|
||||
å Ã¥
|
||||
æ æ
|
||||
ç ç
|
||||
è è
|
||||
é é
|
||||
ê ê
|
||||
ë ë
|
||||
ì ì
|
||||
í í
|
||||
î î
|
||||
ï ï
|
||||
ð ð
|
||||
ñ ñ
|
||||
ò ò
|
||||
ó ó
|
||||
ô ô
|
||||
õ Ãμ
|
||||
ö ö
|
||||
÷ ÷
|
||||
ø ø
|
||||
ù ù
|
||||
ú ú
|
||||
û û
|
||||
ü ü
|
||||
ý ý
|
||||
þ þ
|
||||
ÿ ÿ
|
||||
|
||||

|
||||
™ â\x84¢
|
||||
€ â\x82¬
|
||||
æ æ
|
||||
Œ Å\x92
|
||||
≤ â\x89¤
|
||||
≠ â\x89
|
||||
≥ â\x89¥
|
||||
fi ï¬\x81
|
||||
fl ï¬\x82
|
||||
∞ â\x88\x9e
|
||||
• â\x80¢
|
||||
⁄ â\x81\x84
|
||||
≈ â\x89\x88
|
||||
◊ â\x97\x8a
|
||||
∑ â\x88\x91
|
||||
∏ â\x88\x8f
|
||||
π Ï\x80
|
||||
∂ â\x88\x82
|
||||
∆ â\x88\x86
|
||||
ƒ Æ\x92
|
||||
Ω Î©
|
||||
√ â\x88\x9a
|
||||
∫ â\x88«
|
||||
† â\x80
|
||||
‡ â\x80¡
|
||||
ı ı
|
||||
› â\x80º
|
||||
˚ Ë\x9a
|
||||
˙ Ë\x99
|
||||
ˇ Ë\x87
|
||||
˝ Ë\x9d
|
||||
˛ Ë\x9b
|
||||
‘ â\x80\x98
|
||||
’ â\x80\x99
|
||||
‚ â\x80\x9a
|
||||
“ â\x80\x9c
|
||||
” â\x80\x9d
|
||||
„ â\x80\x9e
|
||||
… â\x80¦
|
||||
— â\x80\x94
|
||||
– â\x80\x93
|
||||
|
||||
CHARSET=utf-8 CHARSET=iso-8859-1
|
||||
CHARSET=utf-8 CHARSET=iso-iso-8859-1
|
||||
"""
|
||||
|
||||
# Search & replace driven by the mapping table, done in a single pass.
def fixit(text, table=None):
    """Return `text` with every garbled sequence replaced by its real char.

    text  -- the string to repair.
    table -- optional replacement table laid out like the module-level
             `mapping`: one pair per line, written as the replacement,
             a single TAB, then the pattern to search for.  Defaults to
             `mapping`.

    Blank lines in the table are ignored.  Raises ValueError when a
    non-blank line does not contain exactly one TAB.
    """
    import re  # local import keeps this block self-contained

    if table is None:
        table = mapping

    # Only plain ASCII whitespace is trimmed.  A bare strip() would, on
    # Unicode strings, also eat U+00A0 / U+0085, which are legitimate
    # trailing characters of some patterns.
    blanks = ' \t\r\n'

    replacements = {}
    for line in table.split('\n'):
        if not line.strip(blanks):
            continue
        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError("bad mapping line (expected one TAB): %r" % line)
        repl = fields[0].strip(blanks)
        patt = fields[1].strip(blanks)
        if not patt:
            # replace('', x) would insert x between every character.
            continue
        # First definition wins, as it did with sequential replacement.
        replacements.setdefault(patt, repl)

    if not replacements:
        return text

    # One alternation, longest pattern first, applied in a single scan:
    # text produced by one replacement is never re-examined, so a restored
    # character cannot combine with its neighbour and match another rule.
    ordered = sorted(replacements, key=len, reverse=True)
    finder = re.compile('|'.join([re.escape(patt) for patt in ordered]))
    return finder.sub(lambda found: replacements[found.group(0)], text)
|
||||
|
||||
def main(argv):
    """Run the command line tool and return the process exit status.

    argv -- the full argument vector, program name first.

    With no file arguments, STDIN is fixed and written to STDOUT.
    Otherwise each named file is fixed in turn: rewritten in place when
    the first argument is -w, written to STDOUT when it is not.
    Returns 0 on success, 1 as soon as one file cannot be processed.
    """
    args = list(argv[1:])

    # User wants to save the file in place or not?
    write_file = bool(args) and args[0] == '-w'
    if write_file:
        args.pop(0)

    if not args:
        # No input file, read from STDIN and send results to STDOUT
        sys.stdout.write(fixit(sys.stdin.read()))
        return 0

    # Fix input files one by one
    for this_file in args:
        try:
            # Read and fix; "with" closes the handle even on failure
            with open(this_file, 'r') as source:
                fixed = fixit(source.read())

            if write_file:
                with open(this_file, 'w') as target:
                    target.write(fixed)
        except Exception as error:
            # Report on STDERR: in filter mode STDOUT carries the fixed
            # document and must not be polluted with diagnostics.
            sys.stderr.write("Error fixing %s: %s\n" % (this_file, error))
            return 1

        # Confirm the save, or show the result on STDOUT
        if write_file:
            sys.stdout.write("Fixed %s\n" % this_file)
        else:
            sys.stdout.write(fixed)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
Loading…
Reference in a new issue