opentx/tools/charset.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# used ? Δ~\n\t

import os

# Characters shared by every language subset (94 in total).  The layout
# follows printable ASCII starting at the space character, except that '°'
# sits where '@' would be and the three slots after 'z' hold '~', '|', '≥'.
standard_chars = """ !"#$%&'()*+,-./0123456789:;<=>?°ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz~|≥"""

# 21 consecutive code points U+10000..U+10014, placed right after
# standard_chars by get_chars().
# NOTE(review): presumably stand-ins for firmware-specific symbols - confirm
# against the font generation tooling.
extra_chars = "".join(map(chr, range(0x10000, 0x10000 + 21)))


def cjk_chars(lang):
    """Collect the CJK ideographs that occur in a translation file.

    The file read is ``../radio/src/translations/<lang>.h.txt`` relative to
    the directory containing this script, decoded as UTF-8.

    Args:
        lang: Language code used to build the translation file name.

    Returns:
        A list of the distinct characters in the range U+4E00..U+9FFF
        (inclusive) found in the file, sorted by code point.

    Raises:
        OSError: If the translation file cannot be opened.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    source = os.path.join(here, "../radio/src/translations/%s.h.txt" % lang)
    with open(source, encoding='utf-8') as handle:
        text = handle.read()
    # A set removes duplicates; sorting makes the resulting order stable.
    return sorted({ch for ch in text if 0x4E00 <= ord(ch) <= 0x9FFF})


# Language-specific characters, keyed by language subset code.  get_chars()
# appends the entry for the requested subset after the standard and extra
# character sets, so the order of the characters inside each string matters.
special_chars = {
    "en": "",
    "fr": "éèàîç",
    "de": "ÄäÖöÜüß",
    "cz": "ěščřžýáíéňóůúďťĚŠČŘŽÝÁÍÉŇÓÚŮĎŤ",
    "nl": "",
    "es": "Ññ",
    "fi": "åäöÅÄÖ",
    "it": "àù",
    "pl": "ąćęłńóśżźĄĆĘŁŃÓŚŻŹ",
    "pt": "ÁáÂâÃãÀàÇçÉéÊêÍíÓóÔôÕõÚú",
    "se": "åäöÅÄÖ",
    # The CJK sets are not hand-written: they are collected when this module
    # is imported by scanning the matching translation file (see cjk_chars).
    "cn": "".join(cjk_chars("cn")),
    "tw": "".join(cjk_chars("tw")),
}

# Uppercase letters that get no slot of their own: get_chars() leaves them
# out of the character set, and for non-CJK subsets get_chars_encoding()
# gives each one the same code as its lowercase counterpart.
subset_lowercase = {
    "Č": "č",
    "Ě": "ě",
    "Š": "š",
    "Ú": "ú",
    "Ů": "ů",
    "Ž": "ž"
}

# print("CN charset: %d symbols" % len(special_chars["cn"]))


def get_chars(subset):
    """Return the ordered character set for a language subset.

    The result is ``standard_chars`` followed by ``extra_chars`` followed by
    the subset's entry in ``special_chars``, minus any character that is a
    key of ``subset_lowercase`` (those uppercase letters share the code of
    their lowercase form, see get_chars_encoding).

    An "all" mode that merged every language was previously sketched here
    behind an ``if False:`` guard; it could never run and has been removed.

    Args:
        subset: Language subset code, e.g. "fr" or "cn".

    Returns:
        The characters as a single string.  A subset with no entry in
        ``special_chars`` yields just the standard and extra characters.
    """
    language_chars = special_chars.get(subset, "")
    return standard_chars + extra_chars + "".join(
        char for char in language_chars if char not in subset_lowercase
    )


def get_chars_encoding(subset):
    """Map each non-standard character of a subset to its octal escape text.

    For the "cn" and "tw" subsets every character listed in
    ``special_chars[subset]`` is written as two octal escapes: a lead byte of
    0xFE or 0xFF followed by the low eight bits of a 1-based index.  Index
    0x100 is skipped, so the low byte is never zero (presumably to keep NUL
    bytes out of the generated strings - confirm against the firmware).

    NOTE(review): only bit 8 of the index reaches the lead byte, so this
    scheme distinguishes at most 510 characters; anything beyond that would
    reuse earlier codes.  Confirm the cn/tw sets stay below that size.

    For every other subset each character outside ``standard_chars`` is
    written as one octal escape equal to its position in get_chars(subset)
    shifted so that the first non-standard character lands on 128.  The
    uppercase keys of ``subset_lowercase`` are then given the code of their
    lowercase form when that form is present.

    Args:
        subset: Language subset code, e.g. "fr" or "cn".

    Returns:
        A dict from character to escape text such as ``"\\\\200"`` (one
        backslash followed by three octal digits per byte).

    Raises:
        KeyError: If subset is "cn" or "tw" but has no ``special_chars``
            entry.
    """
    chars = get_chars(subset)

    # First-occurrence position of every character, computed once so the
    # loops below do not rescan the string for each character (the cn/tw
    # sets are large).  setdefault keeps the semantics of str.index().
    position = {}
    for pos, char in enumerate(chars):
        position.setdefault(char, pos)

    result = {}
    if subset in ("cn", "tw"):
        cjk_position = {}
        for pos, char in enumerate(special_chars[subset]):
            cjk_position.setdefault(char, pos)

        for char in chars:
            if char in cjk_position:
                index = cjk_position[char] + 1
                if index >= 0x100:
                    # Skip 0x100 so the low byte below is never zero.
                    index += 1
                result[char] = "\\%03o\\%03o" % (0xFE + ((index >> 8) & 0x01), index & 0xFF)
            elif char not in standard_chars and char not in extra_chars:
                # Fallback for characters that are neither CJK nor part of
                # the standard/extra sets.
                result[char] = "\\%03o" % (0xC0 + position[char] - len(standard_chars))
    else:
        # Shift positions so the first non-standard character gets code 128.
        offset = 128 - len(standard_chars)
        for char in chars:
            if char not in standard_chars:
                result[char] = "\\%03o" % (offset + position[char])
        # Uppercase letters without their own slot reuse the lowercase code.
        for upper, lower in subset_lowercase.items():
            if lower in result:
                result[upper] = result[lower]
    return result