Welcome to little lamb

Code » md4c » lila-box » tree

[lila-box] / scripts / build_folding_map.py

#!/usr/bin/env python3

import os
import sys
import textwrap


self_path = os.path.dirname(os.path.realpath(__file__));
f = open(self_path + "/unicode/CaseFolding.txt", "r")

status_list = [ "C", "F" ]

folding_list = [ dict(), dict(), dict() ]

# Filter the foldings for "full" folding.
for line in f:
    comment_off = line.find("#")
    if comment_off >= 0:
        line = line[:comment_off]
    line = line.strip()
    if not line:
        continue

    raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
    if not status.strip() in status_list:
        continue
    codepoint = int(raw_codepoint.strip(), 16)
    mapping = [int(it, 16) for it in raw_mapping.strip().split(" ")]
    mapping_len = len(mapping)

    if mapping_len in range(1, 4):
        folding_list[mapping_len-1][codepoint] = mapping
    else:
        assert(False)
f.close()


# If we assume that (index0 ... index-1) makes a range (as defined below),
# check that the newly provided index is compatible with the range too; i.e.
# verify that the range can be extended without breaking its properties.
#
# Currently, we can handle ranges which:
#
# (1) either form consecutive sequence of codepoints and which map that range
#     to other consecutive range of codepoints (of the same length);
#
# (2) or a consecutive sequence of codepoints with step 2 where each codepoint
#     CP is mapped to the codepoint CP+1
#     (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; 0x1238 -> 0x1239; ...).
#
# Note: When the codepoints in the range are mapped to multiple codepoints,
# only the 1st mapped codepoint is considered. All the other ones have to be
# shared by all the mappings covered by the range.
def is_range_compatible(folding, codepoint_list, index0, index):
    N = index - index0
    codepoint0 = codepoint_list[index0]
    codepoint1 = codepoint_list[index0+1]
    codepointN = codepoint_list[index]
    mapping0 = folding[codepoint0]
    mapping1 = folding[codepoint1]
    mappingN = folding[codepointN]

    # Check the range type (1):
    if codepoint1 - codepoint0 == 1 and codepointN - codepoint0 == N                \
            and mapping1[0] - mapping0[0] == 1 and mapping1[1:] == mapping0[1:]     \
            and mappingN[0] - mapping0[0] == N and mappingN[1:] == mapping0[1:]:
        return True

    # Check the range type (2):
    if codepoint1 - codepoint0 == 2 and codepointN - codepoint0 == 2 * N            \
            and mapping0[0] - codepoint0 == 1                                       \
            and mapping1[0] - codepoint1 == 1 and mapping1[1:] == mapping0[1:]      \
            and mappingN[0] - codepointN == 1 and mappingN[1:] == mapping0[1:]:
        return True

    return False


def mapping_str(list, mapping):
    return ",".join("0x{:04x}".format(x) for x in mapping)

for mapping_len in range(1, 4):
    folding = folding_list[mapping_len-1]
    codepoint_list = list(folding)

    index0 = 0
    count = len(folding)

    records = list()
    data_records = list()

    while index0 < count:
        index1 = index0 + 1
        while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
            index1 += 1

        if index1 - index0 > 2:
            # Range of codepoints
            records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1]))
            data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
            data_records.append(mapping_str(data_records, folding[codepoint_list[index1-1]]))
            index0 = index1
        else:
            # Single codepoint
            records.append("S(0x{:04x})".format(codepoint_list[index0]))
            data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
            index0 += 1

    sys.stdout.write("static const unsigned FOLD_MAP_{}[] = {{\n".format(mapping_len))
    sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110,
                        initial_indent = "    ", subsequent_indent="    ")))
    sys.stdout.write("\n};\n")

    sys.stdout.write("static const unsigned FOLD_MAP_{}_DATA[] = {{\n".format(mapping_len))
    sys.stdout.write("\n".join(textwrap.wrap(", ".join(data_records), 110,
                        initial_indent = "    ", subsequent_indent="    ")))
    sys.stdout.write("\n};\n")