Files
intellecton/venv/lib/python3.12/site-packages/backrefs/uniprops/__init__.py
T

666 lines
22 KiB
Python
Raw Normal View History

"""Unicode Properties."""
from __future__ import annotations
from .unidata import alias
# Span from U+0000 to U+10FFFF, written as `first-last`.
UNICODE_RANGE = '\u0000-\U0010ffff'
# Span from 0x00 to 0xff; note it extends past 7-bit ASCII despite the name.
ASCII_RANGE = '\x00-\xff'
# Lookup modes accepted by the `get_*_property` functions in this module.
# Any mode other than `MODE_UNICODE` reads the `ascii_*` tables; only
# `MODE_ASCII` additionally applies the byte-string formatting of `fmt_string`.
MODE_NORMAL = 0
MODE_ASCII = 1
MODE_UNICODE = 2
def fmt_string(value: str, is_bytes: bool) -> str:
    """Adapt a table string for byte-string use.

    When `is_bytes` is true and `value` ends with `\\U0010ffff`, that final
    character is replaced with `\\xff`. In every other case `value` is
    returned unchanged.
    """
    if is_bytes and value.endswith('\U0010ffff'):
        return value[:-1] + '\xff'
    return value
def get_gc_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `GC` (general category) value.

    `value` may start with `^` to request the `^`-prefixed entry. After alias
    resolution the name must be one or two characters long: the first
    character selects a group in the table and the optional second character
    selects an entry within that group.

    Raises `ValueError('Invalid property')` when the resolved name has the
    wrong length or is not present in the table.
    """
    from .unidata import generalcategory as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_properties
    else:
        table = prop_table.unicode_properties

    negate = value.startswith('^')
    if negate:
        value = value[1:]
    value = alias.unicode_alias['generalcategory'].get(value, value)
    as_bytes = mode == MODE_ASCII

    size = len(value)
    if not 1 <= size <= 2:
        raise ValueError('Invalid property')
    if value[0] not in table:
        raise ValueError('Invalid property')
    if size == 2 and value[1] not in table[value[0]]:
        raise ValueError('Invalid property')

    major = value[0]
    minor = value[1] if size == 2 else None
    group = table.get(major, {})

    if negate:
        # A bare one-letter name uses the group's plain `^` entry.
        suffix = '' if minor is None else minor
        return fmt_string(group.get('^' + suffix, ''), as_bytes)

    if minor is None:
        # One-letter name: concatenate every non-negated entry of the group.
        return ''.join(
            [fmt_string(chars, as_bytes) for key, chars in group.items() if not key.startswith('^')]
        )

    return fmt_string(group.get(minor, ''), as_bytes)
def get_binary_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `BINARY` property entry.

    A leading `^` is kept aside while the remaining name is alias-resolved,
    then put back to select the `^`-prefixed table entry.
    """
    from .unidata import binary as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_binary
    else:
        table = prop_table.unicode_binary

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['binary'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_canonical_combining_class_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `CANONICAL COMBINING CLASS` entry.

    The name (without any leading `^`) is alias-resolved; a leading `^`
    selects the `^`-prefixed table entry.
    """
    from .unidata import canonicalcombiningclass as prop_table

    table = (
        prop_table.ascii_canonical_combining_class
        if mode != MODE_UNICODE
        else prop_table.unicode_canonical_combining_class
    )

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['canonicalcombiningclass'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_east_asian_width_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `EAST ASIAN WIDTH` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import eastasianwidth as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_east_asian_width
    else:
        table = prop_table.unicode_east_asian_width

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['eastasianwidth'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_grapheme_cluster_break_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `GRAPHEME CLUSTER BREAK` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import graphemeclusterbreak as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_grapheme_cluster_break
    else:
        table = prop_table.unicode_grapheme_cluster_break

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['graphemeclusterbreak'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_line_break_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `LINE BREAK` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import linebreak as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_line_break
    else:
        table = prop_table.unicode_line_break

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['linebreak'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_sentence_break_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `SENTENCE BREAK` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import sentencebreak as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_sentence_break
    else:
        table = prop_table.unicode_sentence_break

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['sentencebreak'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_word_break_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `WORD BREAK` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import wordbreak as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_word_break
    else:
        table = prop_table.unicode_word_break

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['wordbreak'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_hangul_syllable_type_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `HANGUL SYLLABLE TYPE` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import hangulsyllabletype as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_hangul_syllable_type
    else:
        table = prop_table.unicode_hangul_syllable_type

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['hangulsyllabletype'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_indic_positional_category_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `INDIC POSITIONAL/MATRA CATEGORY` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import indicpositionalcategory as prop_table

    table = (
        prop_table.ascii_indic_positional_category
        if mode != MODE_UNICODE
        else prop_table.unicode_indic_positional_category
    )

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['indicpositionalcategory'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_indic_syllabic_category_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `INDIC SYLLABIC CATEGORY` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import indicsyllabiccategory as prop_table

    table = (
        prop_table.ascii_indic_syllabic_category
        if mode != MODE_UNICODE
        else prop_table.unicode_indic_syllabic_category
    )

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['indicsyllabiccategory'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_decomposition_type_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `DECOMPOSITION TYPE` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import decompositiontype as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_decomposition_type
    else:
        table = prop_table.unicode_decomposition_type

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['decompositiontype'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_nfc_quick_check_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `NFC QUICK CHECK` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import quickcheck as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_nfc_quick_check
    else:
        table = prop_table.unicode_nfc_quick_check

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['nfcquickcheck'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_nfd_quick_check_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `NFD QUICK CHECK` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import quickcheck as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_nfd_quick_check
    else:
        table = prop_table.unicode_nfd_quick_check

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['nfdquickcheck'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_nfkc_quick_check_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `NFKC QUICK CHECK` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import quickcheck as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_nfkc_quick_check
    else:
        table = prop_table.unicode_nfkc_quick_check

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['nfkcquickcheck'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_nfkd_quick_check_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `NFKD QUICK CHECK` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import quickcheck as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_nfkd_quick_check
    else:
        table = prop_table.unicode_nfkd_quick_check

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['nfkdquickcheck'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_numeric_type_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `NUMERIC TYPE` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import numerictype as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_numeric_type
    else:
        table = prop_table.unicode_numeric_type

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['numerictype'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_numeric_value_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `NUMERIC VALUE` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import numericvalue as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_numeric_values
    else:
        table = prop_table.unicode_numeric_values

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['numericvalue'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_age_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `AGE` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import age as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_age
    else:
        table = prop_table.unicode_age

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['age'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_joining_type_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `JOINING TYPE` entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import joiningtype as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_joining_type
    else:
        table = prop_table.unicode_joining_type

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['joiningtype'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_joining_group_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `JOINING GROUP` entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import joininggroup as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_joining_group
    else:
        table = prop_table.unicode_joining_group

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['joininggroup'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_script_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `SC` (script) entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import script as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_scripts
    else:
        table = prop_table.unicode_scripts

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['script'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_script_extension_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up an `SCX` (script extensions) entry.

    Names are resolved through the `script` alias table; a leading `^`
    selects the `^`-prefixed entry.
    """
    from .unidata import scriptextensions as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_script_extensions
    else:
        table = prop_table.unicode_script_extensions

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['script'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_block_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `BLK` (block) entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import block as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_blocks
    else:
        table = prop_table.unicode_blocks

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['block'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_bidi_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `BC` (bidi class) entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import bidiclass as prop_table

    if mode != MODE_UNICODE:
        table = prop_table.ascii_bidi_classes
    else:
        table = prop_table.unicode_bidi_classes

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['bidiclass'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_bidi_paired_bracket_type_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `BPT` (bidi paired bracket type) entry.

    A leading `^` is preserved around alias resolution of the name.
    """
    from .unidata import bidipairedbrackettype as prop_table

    table = (
        prop_table.ascii_bidi_paired_bracket_type
        if mode != MODE_UNICODE
        else prop_table.unicode_bidi_paired_bracket_type
    )

    caret = '^' if value.startswith('^') else ''
    bare = value[len(caret):]
    key = caret + alias.unicode_alias['bidipairedbrackettype'].get(bare, bare)
    return fmt_string(table[key], mode == MODE_ASCII)
def get_vertical_orientation_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Look up a `VO` (vertical orientation) entry.

    The name is alias-resolved; a leading `^` selects the `^`-prefixed entry.
    """
    from .unidata import verticalorientation as prop_table

    table = (
        prop_table.ascii_vertical_orientation
        if mode != MODE_UNICODE
        else prop_table.unicode_vertical_orientation
    )

    negated = value.startswith('^')
    name = value[1:] if negated else value
    name = alias.unicode_alias['verticalorientation'].get(name, name)
    key = '^' + name if negated else name
    return fmt_string(table[key], mode == MODE_ASCII)
def get_is_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Resolve the `is<name>` shortcut for a script-extension or binary property.

    `value` must be `is<name>` or `^is<name>`. The name is first tried as a
    script (via the `script` aliases and the script-extension table); if that
    key is absent, it is looked up in the binary table instead.

    Raises `ValueError` when the two characters after the optional `^` are
    not `is`.
    """
    from .unidata import scriptextensions as scx
    from .unidata import binary

    negate = '^' if value.startswith('^') else ''
    start = len(negate)
    prefix = value[start:start + 2]
    temp = value[start + 2:]
    if prefix != 'is':
        raise ValueError("Does not start with 'is'!")

    if mode != MODE_UNICODE:
        script_obj, bin_obj = scx.ascii_script_extensions, binary.ascii_binary
    else:
        script_obj, bin_obj = scx.unicode_script_extensions, binary.unicode_binary

    as_bytes = mode == MODE_ASCII
    key = negate + alias.unicode_alias['script'].get(temp, temp)
    if key in script_obj:
        return fmt_string(script_obj[key], as_bytes)

    # Not a known script: fall back to treating the name as a binary property.
    key = negate + alias.unicode_alias['binary'].get(temp, temp)
    return fmt_string(bin_obj[key], as_bytes)
def get_in_property(value: str, mode: int = MODE_UNICODE) -> str:
    """Resolve the `in<name>` shortcut for a `Block` property.

    `value` must be `in<name>` or `^in<name>`; the name is resolved through
    the `block` aliases and looked up in the block table.

    Raises `ValueError` when the two characters after the optional `^` are
    not `in`.
    """
    from .unidata import block as prop_table

    negate = '^' if value.startswith('^') else ''
    start = len(negate)
    prefix = value[start:start + 2]
    temp = value[start + 2:]
    if prefix != 'in':
        raise ValueError("Does not start with 'in'!")

    key = negate + alias.unicode_alias['block'].get(temp, temp)
    if mode != MODE_UNICODE:
        table = prop_table.ascii_blocks
    else:
        table = prop_table.unicode_blocks
    return fmt_string(table[key], mode == MODE_ASCII)
def _is_binary(name: str) -> bool:
    """Return whether `name` is a binary property name or a binary property alias."""
    from .unidata import binary as prop_table

    if name in prop_table.unicode_binary:
        return True
    return name in alias.unicode_alias['binary']
# Maps an alias-resolved enumerated property name to the function that looks
# up one of its values.
_ENUM_PROPERTY_GETTERS = {
    'generalcategory': get_gc_property,
    'script': get_script_property,
    'scriptextensions': get_script_extension_property,
    'block': get_block_property,
    'bidiclass': get_bidi_property,
    'bidipairedbrackettype': get_bidi_paired_bracket_type_property,
    'age': get_age_property,
    'eastasianwidth': get_east_asian_width_property,
    'indicpositionalcategory': get_indic_positional_category_property,
    'indicsyllabiccategory': get_indic_syllabic_category_property,
    'hangulsyllabletype': get_hangul_syllable_type_property,
    'decompositiontype': get_decomposition_type_property,
    'canonicalcombiningclass': get_canonical_combining_class_property,
    'numerictype': get_numeric_type_property,
    'numericvalue': get_numeric_value_property,
    'joiningtype': get_joining_type_property,
    'joininggroup': get_joining_group_property,
    'graphemeclusterbreak': get_grapheme_cluster_break_property,
    'linebreak': get_line_break_property,
    'sentencebreak': get_sentence_break_property,
    'wordbreak': get_word_break_property,
    'nfcquickcheck': get_nfc_quick_check_property,
    'nfdquickcheck': get_nfd_quick_check_property,
    'nfkcquickcheck': get_nfkc_quick_check_property,
    'nfkdquickcheck': get_nfkd_quick_check_property,
    'verticalorientation': get_vertical_orientation_property,
}

# Lookups attempted, in this order, when only a bare name is supplied
# (no `name=value` pair). The first one that succeeds wins.
_BARE_PROPERTY_GETTERS = (
    get_gc_property,
    get_script_extension_property,
    get_binary_property,
    get_block_property,
    get_is_property,
    get_in_property,
)


def get_unicode_property(prop: str, value: str | None = None, mode: int = MODE_UNICODE) -> str:
    """Retrieve the Unicode category from the table.

    Two call forms are supported:

    * `prop` and `value` both given (`name=value`): `prop` names a property
      and `value` one of its values. For a binary property `value` must be
      one of `y`/`yes`/`t`/`true` or `n`/`no`/`f`/`false`; a false value
      flips the negation. Any other property name is alias-resolved and
      dispatched to the matching `get_*_property` function.
    * `value` is `None`: `prop` is a bare name that is tried, in order, as a
      general category, script extension, binary property, block, `is...`
      shortcut and `in...` shortcut.

    A leading `^` on `prop` requests the negated form.

    Raises `ValueError` when nothing matches, when the property name is
    unknown, or when a binary property is given a non-boolean value.
    """
    if value is not None:
        negate = prop.startswith('^')
        # Strip the negation marker before classifying the property: binary
        # aliases are looked up without a `^` prefix (see `get_binary_property`),
        # so testing the raw `prop` would miss negated aliases.
        name = prop[1:] if negate else prop

        # Normalize binary true/false input so we can handle it properly
        if _is_binary(name):
            if value in ('n', 'no', 'f', 'false'):
                negate = not negate
            elif value not in ('y', 'yes', 't', 'true'):
                raise ValueError(f"'{value}' is not a valid value for the binary property '{prop}'")
            return get_binary_property('^' + name if negate else name, mode)

        if negate:
            # The value-level getters expect the negation on the value.
            value = '^' + value
        name = alias.unicode_alias['_'].get(name, name)

        try:
            getter = _ENUM_PROPERTY_GETTERS.get(name)
            if getter is None:
                raise ValueError(f"'{prop}={value}' does not have a valid property name")
            return getter(value, mode)
        except Exception as e:
            # Any lookup failure is reported uniformly as an invalid property.
            raise ValueError(f"'{prop}={value}' does not appear to be a valid property") from e

    for getter in _BARE_PROPERTY_GETTERS:
        try:
            return getter(prop, mode)
        except Exception:
            # Deliberate best effort: a failed lookup just means the name
            # belongs to a different property kind, so try the next one.
            continue

    raise ValueError(f"'{prop}' does not appear to be a valid property")