dotfiles/Utils/fix-htmldoc-utf8.py
Frank Villaro-Dixon 8e94c040c1 fix-htmldoc-utf8
2014-09-16 18:28:22 +02:00

211 lines
4.2 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python2
# coding: utf-8
#
# Name : fix-htmldoc-utf8
# Summary: Program to fix UTF-8 characters that HTMLDOC has messed
# Author : Aurelio Jargas www.aurelio.net/soft
# License: BSD
# Release: April, 2008
#
# HTMLDOC has no Unicode support, so when you try to use it in a UTF-8 file,
# all the special characters (not ASCII) will be incorrect in the resulting HTML.
# This program fixes this, restoring the original UTF-8 characters.
#
# Just use it as a filter (reads STDIN, writes results to STDOUT) or use the
# -w option to fix the file in place.
#
# Examples:
# cat myfile.html | fix-htmldoc-utf8 > myfile-ok.html
# fix-htmldoc-utf8 myfile.html > myfile-ok.html
# fix-htmldoc-utf8 -w myfile.html
#
import sys
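# You can add new chars to this mapping, if needed.
# Each line holds the correct character, a TAB, then the garbled sequence
# that gets replaced by it.
# The first set are the ISO-8859-1 extended chars.
# The second set are the Unicode chars I've found on my keyboard.
#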
mapping = """
¡ ¡
¢ ¢
£ £
¤ ¤
¥ Â¥
¦ ¦
§ §
¨ ¨
© ©
ª ª
« «
¬ ¬
® ®
¯ ¯
° °
± ±
² ²
³ ³
´ ´
µ Âμ
¶ ¶
· ·
¸ ¸
¹ ¹
º º
» »
¼ ¼
½ ½
¾ ¾
¿ ¿
À Ã\x80
Á Ã\x81
 Ã\x82
à Ã\x83
Ä Ã\x84
Å Ã\x85
Æ Ã\x86
Ç Ã\x87
È Ã\x88
É Ã\x89
Ê Ã\x8a
Ë Ã\x8b
Ì Ã\x8c
Í Ã\x8d
Î Ã\x8e
Ï Ã\x8f
Ð Ã\x90
Ñ Ã\x91
Ò Ã\x92
Ó Ã\x93
Ô Ã\x94
Õ Ã\x95
Ö Ã\x96
× Ã\x97
Ø Ã\x98
Ù Ã\x99
Ú Ã\x9a
Û Ã\x9b
Ü Ã\x9c
Ý Ã\x9d
Þ Ã\x9e
ß Ã\x9f
à à
á á
â â
ã ã
ä ä
å Ã¥
æ æ
ç ç
è è
é é
ê ê
ë ë
ì ì
í í
î î
ï ï
ð ð
ñ ñ
ò ò
ó ó
ô ô
õ Ãμ
ö ö
÷ ÷
ø ø
ù ù
ú ú
û û
ü ü
ý ý
þ þ
ÿ ÿ
 
™ â\x84¢
€ â\x82¬
æ æ
Œ Å\x92
≤ â\x89¤
≠ â\x89 
≥ â\x89¥
fi ï¬\x81
fl ï¬\x82
∞ â\x88\x9e
• â\x80¢
â\x81\x84
≈ â\x89\x88
◊ â\x97\x8a
∑ â\x88\x91
∏ â\x88\x8f
π Ï\x80
∂ â\x88\x82
∆ â\x88\x86
ƒ Æ\x92
Ω Î©
√ â\x88\x9a
∫ â\x88«
† â\x80 
‡ â\x80¡
ı ı
â\x80º
˚ Ë\x9a
˙ Ë\x99
ˇ Ë\x87
˝ Ë\x9d
˛ Ë\x9b
â\x80\x98
â\x80\x99
â\x80\x9a
“ â\x80\x9c
” â\x80\x9d
„ â\x80\x9e
… â\x80¦
— â\x80\x94
â\x80\x93
CHARSET=utf-8 CHARSET=iso-8859-1
CHARSET=utf-8 CHARSET=iso-iso-8859-1
"""
# Just a standard search & replace
def fixit(text, table=None):
    """Return `text` with every garbled sequence replaced by its correct char.

    text  -- the string to repair.
    table -- optional replacement table; one entry per line, written as
             "<replacement> TAB <pattern>". Defaults to the module-level
             `mapping`.

    Entries are applied in table order. Surrounding whitespace is stripped
    from both fields. Lines that do not consist of exactly two TAB-separated
    fields (blank, whitespace-only, malformed) are skipped, as are entries
    whose pattern is empty after stripping.
    """
    if table is None:
        table = mapping

    for line in table.split('\n'):
        fields = line.split('\t')
        if len(fields) != 2:
            # Not a "<replacement> TAB <pattern>" entry; unpacking it would
            # raise ValueError, so there is nothing to apply.
            continue

        repl = fields[0].strip()
        patt = fields[1].strip()
        if not patt:
            # Replacing the empty string would insert `repl` between every
            # character of the text.
            continue

        text = text.replace(patt, repl)

    return text
# Command-line driver: fix every file named on the command line (in place
# when -w is given, otherwise to STDOUT), or act as a STDIN -> STDOUT filter
# when no file is named.

# User wants to save the file in place or not?
write_file = False
if len(sys.argv) > 1 and sys.argv[1] == '-w':
    write_file = True
    sys.argv.pop(1)

# The input files (if any)
files = sys.argv[1:]

if files:
    # Fix input files one by one
    for this_file in files:
        try:
            # Read and fix; `with` closes the handle even if reading fails.
            with open(this_file, 'r') as f:
                fixed = fixit(f.read())

            # Save the file in place when requested.
            if write_file:
                with open(this_file, 'w') as f:
                    f.write(fixed)
        except (IOError, OSError):
            # Only I/O failures are reported here; Ctrl-C and programming
            # errors propagate instead of being mislabelled. The message
            # goes to STDERR so it never mixes with the fixed text on STDOUT.
            sys.stderr.write("Error fixing %s\n" % this_file)
            sys.exit(1)

        if write_file:
            sys.stdout.write("Fixed %s\n" % this_file)
        else:
            # Emit the text verbatim; `print fixed,` could add a separator
            # space between consecutive outputs.
            sys.stdout.write(fixed)
else:
    # No input file, read from STDIN and send results to STDOUT
    sys.stdout.write(fixit(sys.stdin.read()))