# -*- coding: utf-8 -*-
# file: ZhConversion.py
import re
import zipfile
import codecs
def getUnihan(unihan_fname='Unihan.zip'):
    """Read the ``Unihan.txt`` member out of a Unihan zip archive.

    :param unihan_fname: path of the zip archive to open.
    :returns: the raw content of ``Unihan.txt`` as returned by
        ``ZipFile.read``, or ``False`` when opening or reading the
        archive raises ``IOError``.
    """
    try:
        unihanzipfile = zipfile.ZipFile(unihan_fname, 'r')
        try:
            data = unihanzipfile.read('Unihan.txt')
        finally:
            # Always release the archive handle, even if the read fails.
            unihanzipfile.close()
    except IOError:
        print(unihan_fname + ' not found.')
        return False
    print(unihan_fname + ' found.')
    return data
def getConversionTable(ctable_fname):
    """Read a conversion table file as UTF-8 text.

    :param ctable_fname: path of the conversion table file.
    :returns: the decoded file content, or ``False`` when the file cannot
        be opened or read (``IOError``).
    """
    try:
        # The context manager closes the file even if read() fails.
        with codecs.open(ctable_fname, 'r', 'utf-8') as conversiontable:
            data = conversiontable.read()
    except IOError:
        print(ctable_fname + ' not found.')
        return False
    print(ctable_fname + ' found.')
    return data
def getConversionTableDiff(ctablediff_fname):
    """Read the ``.diff`` file that accompanies a conversion table.

    :param ctablediff_fname: path of the conversion table; ``'.diff'`` is
        appended to obtain the file that is actually read.
    :returns: the decoded UTF-8 content of the diff file, or ``False``
        when it cannot be opened or read (``IOError``).
    """
    try:
        # The context manager closes the file even if read() fails.
        with codecs.open(ctablediff_fname + '.diff', 'r', 'utf-8') as conversiontablediff:
            return conversiontablediff.read()
    except IOError:
        return False
def patchConversionTable(orig, diff, variant):
    """Apply a unified-diff style patch to the text of a conversion table.

    Lines of ``diff`` are interpreted by their prefix:

    * ``@@``  - hunk header; the start line of its third whitespace-separated
      field (the ``+start[,count]`` part) becomes the current position.
    * ``' '`` - context line; must equal the current line of the text.
    * ``+``   - line to insert at the current position (``+++`` is skipped).
    * ``-``   - line to remove; must equal the current line (``---`` is
      skipped).

    Any other line is ignored.

    :param orig: text of the conversion table; a BOM in its first line is
        stripped.
    :param diff: text of the patch.
    :param variant: table name, used only in the progress messages.
    :returns: the patched text with lines joined by ``'\\n'``, or ``False``
        when a context/removal line does not match the text (including when
        it points past the end of the text).
    """
    print(' ' + variant + ".diff found, try to merge with file " + variant + '...')
    origlines = orig.splitlines()
    if origlines:
        # An empty table has no first line to strip the BOM from.
        origlines[0] = origlines[0].replace(u'\ufeff', '')
    difflines = diff.splitlines()
    i = 0
    for diffline in difflines:
        if diffline.startswith('@@'):
            # A start of 0 (e.g. "+0,0") would give -1 and silently index
            # from the end of the list, so clamp it to the first line.
            i = max(int(diffline.split()[2].split(',')[0]) - 1, 0)
        elif diffline.startswith('+') and not diffline.startswith('+++'):
            origlines.insert(i, diffline[1:])
            i += 1
        elif diffline.startswith(' ') or (
                diffline.startswith('-') and not diffline.startswith('---')):
            # Context and removal lines must both match the current line;
            # running past the end of the text counts as a mismatch.
            if i >= len(origlines) or diffline[1:] != origlines[i]:
                print(' ' + variant + ".diff can't merge with file " + variant + '.')
                return False
            if diffline.startswith(' '):
                i += 1
            else:
                origlines.pop(i)
    return '\n'.join(origlines)
def getDictFromUnihan(variant):
    """Build a character mapping from one field of ``Unihan.txt``.

    Every line of the form ``<source>\\t<variant>\\t<targets...>`` whose first
    target differs from the source contributes one entry. Source and target
    are written like ``U+4E07``: the first two characters are dropped and the
    rest is parsed as a hexadecimal code point.

    :param variant: name of the Unihan field to extract
        (e.g. ``'kSimplifiedVariant'``).
    :returns: dict mapping source character to target character; empty when
        ``Unihan.zip`` could not be read.
    """
    unihanfile = getUnihan(unihan_fname='Unihan.zip')
    to = {}
    if unihanfile is False:
        # getUnihan() already reported the problem; there is nothing to parse.
        return to
    sept = '\t' + variant + '\t'
    if isinstance(unihanfile, bytes):
        # The archive content is raw bytes, so the separator must be bytes
        # too (a no-op on Python 2, where bytes is str).
        sept = sept.encode('ascii')
    for elem in unihanfile.splitlines():
        left, sep, right = elem.partition(sept)
        if sep != sept:
            continue
        targets = right.split()
        if not targets:
            # Field present but empty: no target to map to.
            continue
        right = targets[0]
        if left != right:
            to[ucs4chr(int(left[2:], 16))] = ucs4chr(int(right[2:], 16))
    return to
def getDictFromConversionTable(to, variant):
    """Merge the rules of a conversion table file into ``to``.

    The table named ``variant`` is read; if a ``variant + '.diff'`` file
    exists and applies cleanly, the patched table is saved through
    ``saveConversionTable`` and used instead. Rules are the ``left=>right``
    lines found inside ``-{ ... }-`` groups: ``*`` is removed from the left
    side, everything after ``//`` and every ``;`` is removed from the right
    side, and both sides are stripped.

    A rule mapping a key already in ``to`` onto itself removes that key;
    every other rule sets ``to[left] = right``.

    :param to: dict to update in place.
    :param variant: name of the conversion table file.
    :returns: ``to``, unchanged when the table file could not be read.
    """
    conversiontable = getConversionTable(ctable_fname=variant)
    if conversiontable is False:
        # getConversionTable() already reported the missing file.
        return to
    conversiontablediff = getConversionTableDiff(ctablediff_fname=variant)
    if conversiontablediff:
        conversiontb = patchConversionTable(conversiontable, conversiontablediff, variant)
        if conversiontb:
            saveConversionTable(variant, conversiontb)
            conversiontable = conversiontb
    # Raw string: same pattern, without invalid string escape sequences.
    p = re.compile(r'-\{([\s\S]*?)\}-')
    for conversions in p.findall(conversiontable):
        for elem in conversions.splitlines():
            left, sep, right = elem.partition('=>')
            if sep != '=>':
                continue
            left = left.replace('*', '').strip()
            right = right.partition('//')[0].replace(';', '').strip()
            if left in to and left == right:
                # An identity rule cancels a mapping inherited from ``to``.
                to.pop(left)
            else:
                to[left] = right
    return to
def toHansDict():
    """Return the mapping built from the Unihan ``kSimplifiedVariant`` field
    and then updated with the ``Zh-hans`` conversion table."""
    return getDictFromConversionTable(
        getDictFromUnihan('kSimplifiedVariant'), 'Zh-hans')
def toHantDict():
    """Return the mapping built from the Unihan ``kTraditionalVariant`` field
    and then updated with the ``Zh-hant`` conversion table."""
    return getDictFromConversionTable(
        getDictFromUnihan('kTraditionalVariant'), 'Zh-hant')
def toOtherDict(variant):
    """Return the mapping defined by the conversion table ``variant`` alone,
    starting from an empty dict."""
    return getDictFromConversionTable({}, variant)
def getConversionCode(to):
    """Render a mapping as the body of a PHP array literal.

    Entries are emitted in sorted key order, one per line, as
    ``"key" => "value",``. Characters that are special inside a PHP
    double-quoted string (backslash, double quote, dollar sign) are
    backslash-escaped so they cannot end the string early or trigger
    variable interpolation.

    :param to: dict mapping source strings to target strings.
    :returns: the generated lines concatenated into one string (empty for
        an empty dict).
    """
    def php_escape(text):
        # The backslash must be escaped first so that the escapes added for
        # the other characters are not doubled.
        return text.replace('\\', '\\\\').replace('"', '\\"').replace('$', '\\$')

    return ''.join(
        '"' + php_escape(left) + '" => "' + php_escape(to[left]) + '",\n'
        for left in sorted(to)
    )
def saveFile(toHant, toHans, toTW, toHK, toCN, toSG):
    """Write ``ZhConversion.php`` containing the six conversion tables.

    The file starts with a PHP header comment, followed by one
    ``$zh2XXX = array(...);`` definition per table, in the order Hant, Hans,
    TW, HK, CN, SG. Each array body comes from ``getConversionCode``.

    :param toHant: mapping for ``$zh2Hant``.
    :param toHans: mapping for ``$zh2Hans``.
    :param toTW: mapping for ``$zh2TW``.
    :param toHK: mapping for ``$zh2HK``.
    :param toCN: mapping for ``$zh2CN``.
    :param toSG: mapping for ``$zh2SG``.
    """
    tables = (
        (u'$zh2Hant', toHant),
        (u'$zh2Hans', toHans),
        (u'$zh2TW', toTW),
        (u'$zh2HK', toHK),
        (u'$zh2CN', toCN),
        (u'$zh2SG', toSG),
    )
    CString = u'<?php\n/**\n * Simplified / Traditional Chinese conversion tables\n *' \
        + u'\n * Automatically generated using code and data in includes/zhtable/' \
        + u'\n * Do not modify directly!\n */\n\n'
    # Build the whole content before opening the file, so an error while
    # generating it does not leave a truncated ZhConversion.php behind.
    CString += u'\n\n'.join(
        name + u' = array(\n' + getConversionCode(table) + u');'
        for name, table in tables
    )
    # The context manager closes the file even if the write fails.
    with codecs.open('ZhConversion.php', 'w', 'utf-8') as zhConversion:
        zhConversion.write(CString)
    print('ZhConversion.php created / updated successfully.')
def saveConversionTable(variant, conversiontable):
    """Write a conversion table to the file ``variant + '_new'`` as UTF-8.

    :param variant: name (path) of the table; ``'_new'`` is appended to form
        the output file name.
    :param conversiontable: text to write.
    """
    # The context manager closes the file even if the write fails.
    with codecs.open(variant + '_new', 'w', 'utf-8') as conversiontablefile:
        conversiontablefile.write(conversiontable)
    print(' ' + variant + '_new created.')
def ucs4chr(codepoint):
    """Return the character for a Unicode code point.

    When the interpreter's character constructor rejects a supplementary
    code point (U+10000..U+10FFFF) with ``ValueError``, the character is
    returned as a UTF-16 surrogate pair instead.

    :param codepoint: integer code point.
    :returns: a string holding the character (one element, or two when a
        surrogate pair had to be used).
    :raises ValueError: if the code point is rejected and lies outside the
        supplementary range, so no surrogate pair can represent it.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr; chr covers all of Unicode
    try:
        return to_char(codepoint)
    except ValueError:
        if not 0x10000 <= codepoint <= 0x10FFFF:
            # A surrogate pair can only encode U+10000..U+10FFFF; anything
            # else would silently produce a wrong character.
            raise
        hi, lo = divmod(codepoint - 0x10000, 0x400)
        return to_char(0xd800 + hi) + to_char(0xdc00 + lo)
def ucs4ord(str):
    """Return the code point of a character given as one or two elements.

    :param str: a one-element string, or a two-element string holding a
        UTF-16 surrogate pair (high surrogate first).
    :returns: the integer code point.
    :raises TypeError: if the string has any other length or its two
        elements are not a high/low surrogate pair.
    """
    if len(str) == 1:
        return ord(str)
    if len(str) == 2:
        hi, lo = ord(str[0]) - 0xd800, ord(str[1]) - 0xdc00
        if 0 <= hi < 0x400 and 0 <= lo < 0x400:
            # Both halves contribute: ten bits from the high surrogate and
            # ten bits from the low surrogate.
            return hi * 0x400 + lo + 0x10000
    raise TypeError("ucs4ord() expected a valid ucs4 character")
# Script body: build every conversion mapping, then write ZhConversion.php.
# The Hant mapping is built before the Hans one, and the regional tables
# follow in the order TW, HK, CN, SG.
toHant, toHans = toHantDict(), toHansDict()
toTW, toHK, toCN, toSG = map(toOtherDict, ('Zh-tw', 'Zh-hk', 'Zh-cn', 'Zh-sg'))
saveFile(toHant, toHans, toTW, toHK, toCN, toSG)