Initial commit: Linux Transcriber app with multi-language and auto-detection support
This commit is contained in:
45
venv/lib/python3.12/site-packages/idna/__init__.py
Normal file
45
venv/lib/python3.12/site-packages/idna/__init__.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from .core import (
|
||||
IDNABidiError,
|
||||
IDNAError,
|
||||
InvalidCodepoint,
|
||||
InvalidCodepointContext,
|
||||
alabel,
|
||||
check_bidi,
|
||||
check_hyphen_ok,
|
||||
check_initial_combiner,
|
||||
check_label,
|
||||
check_nfc,
|
||||
decode,
|
||||
encode,
|
||||
ulabel,
|
||||
uts46_remap,
|
||||
valid_contextj,
|
||||
valid_contexto,
|
||||
valid_label_length,
|
||||
valid_string_length,
|
||||
)
|
||||
from .intranges import intranges_contain
|
||||
from .package_data import __version__
|
||||
|
||||
# Public API of the package: every name listed here is imported above from
# .core, .intranges or .package_data and is what ``from idna import *`` exposes.
__all__ = [
    "__version__",
    "IDNABidiError",
    "IDNAError",
    "InvalidCodepoint",
    "InvalidCodepointContext",
    "alabel",
    "check_bidi",
    "check_hyphen_ok",
    "check_initial_combiner",
    "check_label",
    "check_nfc",
    "decode",
    "encode",
    "intranges_contain",
    "ulabel",
    "uts46_remap",
    "valid_contextj",
    "valid_contexto",
    "valid_label_length",
    "valid_string_length",
]
|
||||
6
venv/lib/python3.12/site-packages/idna/__main__.py
Normal file
6
venv/lib/python3.12/site-packages/idna/__main__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
import sys
|
||||
|
||||
from .cli import main
|
||||
|
||||
# Executed for ``python -m idna``: run the CLI and use main()'s return value
# as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
128
venv/lib/python3.12/site-packages/idna/cli.py
Normal file
128
venv/lib/python3.12/site-packages/idna/cli.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Command-line interface for the :mod:`idna` package.
|
||||
|
||||
Invoked via ``python -m idna``. See :func:`main` for the entry point.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from collections.abc import Iterable
|
||||
from itertools import chain
|
||||
from typing import IO, Optional
|
||||
|
||||
from . import IDNAError, decode, encode
|
||||
from .core import _alabel_prefix, _unicode_dots_re
|
||||
from .package_data import __version__
|
||||
|
||||
|
||||
def _looks_like_alabel(s: str) -> bool:
    """Report whether ``s`` contains at least one ``xn--``-prefixed label.

    The input is split on the Unicode label separators and every resulting
    label is compared, case-insensitively, against the ACE prefix.
    """
    ace_prefix = _alabel_prefix.decode("ascii")
    for part in _unicode_dots_re.split(s):
        if part.lower().startswith(ace_prefix):
            return True
    return False
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser used by :func:`main`.

    The parser accepts an optional mode flag (``-e``/``--encode`` or
    ``-d``/``--decode``), ``--strict``, ``--version`` and zero or more
    positional ``domain`` values.

    :returns: A configured :class:`argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        prog="python -m idna",
        description=(
            "Convert a domain name between its Unicode (U-label) and "
            "ASCII-compatible (A-label) forms. With no mode flag, the "
            "direction is chosen from the first input — if it contains "
            "an xn-- label the stream is decoded, otherwise it is "
            "encoded — and the same mode is applied to every remaining "
            "input. UTS #46 mapping is applied by default; pass "
            "--strict to disable it. When no domains are given on the "
            "command line and stdin is piped, one domain per line is "
            "read from stdin."
        ),
    )
    # Both flags write to the same ``mode`` destination; the group makes
    # argparse reject a command line that supplies both of them.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument(
        "-e",
        "--encode",
        dest="mode",
        action="store_const",
        const="encode",
        help="Encode the input to its ASCII A-label form.",
    )
    mode.add_argument(
        "-d",
        "--decode",
        dest="mode",
        action="store_const",
        const="decode",
        help="Decode the input from its ASCII A-label form.",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Disable the default UTS #46 mapping and apply IDNA 2008 rules verbatim.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"idna {__version__}",
    )
    # nargs="*" lets the list be empty so that main() can fall back to stdin.
    parser.add_argument(
        "domain",
        nargs="*",
        help="One or more domain names to convert. Omit to read from stdin.",
    )
    return parser
|
||||
|
||||
|
||||
def _iter_stdin(stream: IO[str]) -> Iterable[str]:
|
||||
"""Yield non-empty stripped lines from ``stream``, ignoring blanks."""
|
||||
for line in stream:
|
||||
stripped = line.strip()
|
||||
if stripped:
|
||||
yield stripped
|
||||
|
||||
|
||||
def _convert_one(domain: str, mode: str, uts46: bool) -> bool:
    """Convert a single domain and print the outcome.

    The converted form goes to standard output. If the conversion raises
    :exc:`IDNAError`, a diagnostic is written to standard error instead.

    :param domain: The domain name to convert.
    :param mode: ``"decode"`` to decode; any other value encodes.
    :param uts46: Forwarded as the ``uts46`` keyword of the converter.
    :returns: ``True`` when the conversion succeeded, ``False`` otherwise.
    """
    try:
        if mode == "decode":
            converted = decode(domain, uts46=uts46)
        else:
            converted = encode(domain, uts46=uts46).decode("ascii")
    except IDNAError as err:
        print(f"idna: {mode} failed for {domain!r}: {err}", file=sys.stderr)
        return False
    print(converted)
    return True
|
||||
|
||||
|
||||
def main(argv: Optional[list[str]] = None) -> int:
    """Run the ``python -m idna`` command line.

    Domains come from the positional arguments or, when none are given and
    stdin is not a terminal, from stdin (one per line). Without an explicit
    mode flag the first input picks the direction, which is then used for
    every remaining input.

    :param argv: Argument list excluding the program name. Defaults to
        :data:`sys.argv` when ``None``.
    :returns: ``0`` on success, ``1`` if any conversion fails.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    domains: Iterable[str]
    if args.domain:
        domains = args.domain
    elif sys.stdin.isatty():
        # parser.error() prints the usage message and exits the process.
        parser.error("a domain argument is required when stdin is a terminal")
    else:
        domains = _iter_stdin(sys.stdin)

    pending = iter(domains)
    try:
        first = next(pending)
    except StopIteration:
        # Nothing to convert.
        return 0

    mode = args.mode
    if not mode:
        mode = "decode" if _looks_like_alabel(first) else "encode"

    apply_uts46 = not args.strict
    all_ok = True
    # Every input is attempted even after a failure; only the exit code changes.
    for domain in chain([first], pending):
        if not _convert_one(domain, mode, apply_uts46):
            all_ok = False
    return 0 if all_ok else 1
|
||||
|
||||
|
||||
# Allow running this module directly as a script; the exit status is
# whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
|
||||
159
venv/lib/python3.12/site-packages/idna/codec.py
Normal file
159
venv/lib/python3.12/site-packages/idna/codec.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import codecs
|
||||
from typing import Any, Optional
|
||||
|
||||
from .core import IDNAError, _unicode_dots_re, alabel, decode, encode, ulabel
|
||||
|
||||
|
||||
class Codec(codecs.Codec):
    """Stateless codec exposing whole-domain IDNA 2008 conversion.

    ``encode`` and ``decode`` delegate to the module-level :func:`encode`
    and :func:`decode` imported from ``.core``. Empty input is answered
    directly with an empty result and a consumed length of ``0``.

    Only the ``"strict"`` error handler is accepted; any other value raises
    :exc:`~idna.IDNAError`.
    """

    def encode(self, data: str, errors: str = "strict") -> tuple[bytes, int]:  # ty: ignore[invalid-method-override]
        if errors != "strict":
            raise IDNAError(f'Unsupported error handling "{errors}"')
        if data:
            return encode(data), len(data)
        return b"", 0

    def decode(self, data: bytes, errors: str = "strict") -> tuple[str, int]:  # ty: ignore[invalid-method-override]
        if errors != "strict":
            raise IDNAError(f'Unsupported error handling "{errors}"')
        if data:
            return decode(data), len(data)
        return "", 0
|
||||
|
||||
|
||||
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA 2008 encoder.

    Buffers a partial trailing label across calls until either the next
    label separator is seen or ``final=True``, so that streamed input is
    encoded one whole label at a time. Any of the four Unicode label
    separators (``U+002E``, ``U+3002``, ``U+FF0E``, ``U+FF61``) ends a
    label; the result always uses ``U+002E`` as the separator.

    Only the ``"strict"`` error handler is supported.
    """

    def _buffer_encode(self, data: str, errors: str, final: bool) -> tuple[bytes, int]:  # ty: ignore[invalid-method-override]
        # Encode the complete labels found in ``data``.
        #
        # Returns a pair ``(encoded_bytes, consumed)`` where ``consumed`` is the
        # number of input characters covered by the encoded labels and their
        # separators. Raises IDNAError for any error handler other than "strict".
        if errors != "strict":
            raise IDNAError(f'Unsupported error handling "{errors}"')

        if not data:
            return b"", 0

        labels = _unicode_dots_re.split(data)
        trailing_dot = b""
        if labels:
            if not labels[-1]:
                # Input ended with a separator: emit it and drop the empty piece
                trailing_dot = b"."
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # The dropped label was preceded by a separator
                    trailing_dot = b"."

        result = []
        size = 0
        for label in labels:
            result.append(alabel(label))
            if size:
                # Count the one-character separator between consecutive labels
                size += 1
            size += len(label)

        # Join with U+002E
        result_bytes = b".".join(result) + trailing_dot
        size += len(trailing_dot)
        return result_bytes, size
|
||||
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA 2008 decoder.

    Buffers a partial trailing label across calls until either the next
    label separator is seen or ``final=True``, so that streamed input is
    decoded one whole label at a time.

    Only the ``"strict"`` error handler is supported.
    """

    def _buffer_decode(self, data: Any, errors: str, final: bool) -> tuple[str, int]:  # ty: ignore[invalid-method-override]
        # Decode the complete labels found in ``data``.
        #
        # Returns a pair ``(decoded_text, consumed)`` where ``consumed`` is the
        # length of the input covered by the decoded labels and their
        # separators. Raises IDNAError for any error handler other than "strict".
        if errors != "strict":
            raise IDNAError(f'Unsupported error handling "{errors}"')

        if not data:
            return ("", 0)

        # Non-str input is interpreted as ASCII before splitting into labels
        if not isinstance(data, str):
            data = str(data, "ascii")

        labels = _unicode_dots_re.split(data)
        trailing_dot = ""
        if labels:
            if not labels[-1]:
                # Input ended with a separator: emit it and drop the empty piece
                trailing_dot = "."
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # The dropped label was preceded by a separator
                    trailing_dot = "."

        result = []
        size = 0
        for label in labels:
            result.append(ulabel(label))
            if size:
                # Count the one-character separator between consecutive labels
                size += 1
            size += len(label)

        result_str = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result_str, size)
|
||||
|
||||
|
||||
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that combines :class:`Codec` with :class:`codecs.StreamWriter`.

    Adds no behaviour of its own.
    """

    pass
|
||||
|
||||
|
||||
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that combines :class:`Codec` with :class:`codecs.StreamReader`.

    Adds no behaviour of its own.
    """

    pass
|
||||
|
||||
|
||||
def search_function(name: str) -> Optional[codecs.CodecInfo]:
    """Resolve the ``"idna2008"`` codec for the :mod:`codecs` registry.

    The comparison against ``"idna2008"`` is exact; every other name yields
    ``None``.

    :param name: The codec name being looked up.
    :returns: A :class:`codecs.CodecInfo` wired to the classes in this
        module when ``name`` is ``"idna2008"``, otherwise ``None``.
    """
    if name == "idna2008":
        return codecs.CodecInfo(
            name=name,
            encode=Codec().encode,
            decode=Codec().decode,  # type: ignore
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
    return None
|
||||
|
||||
|
||||
# Import-time side effect: make "idna2008" resolvable through the codecs registry.
codecs.register(search_function)
|
||||
41
venv/lib/python3.12/site-packages/idna/compat.py
Normal file
41
venv/lib/python3.12/site-packages/idna/compat.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from typing import Any, Union
|
||||
|
||||
from .core import decode, encode
|
||||
|
||||
|
||||
def ToASCII(label: str) -> bytes:
    """Compatibility shim for :rfc:`3490` ``ToASCII``.

    Calls :func:`idna.encode` with its default options and returns the
    result unchanged. Provided to ease porting of code written against the
    legacy :mod:`encodings.idna` API; new code should call
    :func:`idna.encode` directly.

    :param label: The label or domain to encode.
    :returns: The encoded form as ASCII :class:`bytes`.
    """
    return encode(label)
|
||||
|
||||
|
||||
def ToUnicode(label: Union[bytes, bytearray]) -> str:
    """Compatibility shim for :rfc:`3490` ``ToUnicode``.

    Calls :func:`idna.decode` with its default options and returns the
    result unchanged. Provided to ease porting of code written against the
    legacy :mod:`encodings.idna` API; new code should call
    :func:`idna.decode` directly.

    :param label: The label or domain to decode.
    :returns: The decoded Unicode form.
    """
    return decode(label)
|
||||
|
||||
|
||||
def nameprep(s: Any) -> None:
    """Stub for :rfc:`3491` Nameprep, which is not used by IDNA 2008.

    IDNA 2008 (:rfc:`5891`) replaces Nameprep with the per-codepoint
    validity classes from :rfc:`5892`; this function exists only to
    return a clear error if legacy code attempts to call it.

    :param s: Ignored; accepted only so legacy call sites keep working
        up to the point of the error.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError("IDNA 2008 does not utilise nameprep protocol")
|
||||
648
venv/lib/python3.12/site-packages/idna/core.py
Normal file
648
venv/lib/python3.12/site-packages/idna/core.py
Normal file
@@ -0,0 +1,648 @@
|
||||
import bisect
|
||||
import re
|
||||
import unicodedata
|
||||
import warnings
|
||||
from typing import Optional, Union
|
||||
|
||||
from . import idnadata
|
||||
from .intranges import intranges_contain
|
||||
|
||||
# Combining class value that valid_contextj() compares the preceding
# codepoint against (the virama class)
_virama_combining_class = 9
# ACE prefix identifying a Punycode-encoded A-label
_alabel_prefix = b"xn--"
# Defensive cap on input length; the checks below raise IDNAError beyond it
_max_input_length = 1024
# Matches any one label separator: U+002E, U+3002, U+FF0E or U+FF61
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

# Bidi category sets from RFC 5893, hoisted out of the per-codepoint loop
_bidi_rtl_first = frozenset({"R", "AL"})
_bidi_rtl_categories = frozenset({"R", "AL", "AN"})
_bidi_rtl_allowed = frozenset({"R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
_bidi_rtl_valid_ending = frozenset({"R", "AL", "EN", "AN"})
_bidi_rtl_numeric = frozenset({"AN", "EN"})
_bidi_ltr_allowed = frozenset({"L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
_bidi_ltr_valid_ending = frozenset({"L", "EN"})
# Despite the ``_bidi_`` prefix these two hold joining types, not bidi
# categories: valid_contextj() tests them against _joining_type() results
# while scanning before and after U+200C.
_bidi_joiner_l_or_d = frozenset({"L", "D"})
_bidi_joiner_r_or_d = frozenset({"R", "D"})
|
||||
|
||||
|
||||
def _joining_type(cp: int) -> Optional[str]:
    """Look up the joining type of code point ``cp``.

    Walks ``idnadata.joining_types`` in its iteration order and returns the
    key of the first entry whose ranges contain ``cp``.

    :param cp: The code point, as an integer.
    :returns: The joining type key, or ``None`` if no entry contains ``cp``.
    """
    matches = (jt for jt, ranges in idnadata.joining_types.items() if intranges_contain(cp, ranges))
    return next(matches, None)
|
||||
|
||||
|
||||
class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems.

    Derives from :exc:`UnicodeError`, so it is also caught by handlers for
    :exc:`ValueError`.
    """
|
||||
|
||||
|
||||
class IDNABidiError(IDNAError):
    """Exception when bidirectional requirements are not satisfied.

    Raised by :func:`check_bidi`.
    """
|
||||
|
||||
|
||||
class InvalidCodepoint(IDNAError):
    """Exception when a disallowed or unallocated codepoint is used.

    Raised by :func:`check_label` and :func:`uts46_remap`.
    """
|
||||
|
||||
|
||||
class InvalidCodepointContext(IDNAError):
    """Exception when the codepoint is not valid in the context it is used.

    Raised by :func:`check_label` for codepoints in the CONTEXTJ and
    CONTEXTO classes.
    """
|
||||
|
||||
|
||||
def _combining_class(cp: int) -> int:
|
||||
v = unicodedata.combining(chr(cp))
|
||||
if v == 0 and not unicodedata.name(chr(cp)):
|
||||
raise ValueError("Unknown character in unicodedata")
|
||||
return v
|
||||
|
||||
|
||||
def _is_script(cp: str, script: str) -> bool:
    """Return whether character ``cp`` lies in the ranges stored for ``script``.

    :param cp: A single character (not an integer code point).
    :param script: Key into ``idnadata.scripts``; an unknown key raises
        :exc:`KeyError`.
    """
    return intranges_contain(ord(cp), idnadata.scripts[script])
|
||||
|
||||
|
||||
def _punycode(s: str) -> bytes:
    """Encode ``s`` with Python's built-in ``punycode`` codec.

    The ``xn--`` prefix is not added here.
    """
    return s.encode("punycode")
|
||||
|
||||
|
||||
def _unot(s: int) -> str:
|
||||
return f"U+{s:04X}"
|
||||
|
||||
|
||||
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Tell whether a label fits within the DNS label length limit.

    :rfc:`1035` (see also :rfc:`5891` §4.2.4) caps a DNS label at 63
    octets. The length is taken with :func:`len`, so a :class:`str`
    U-label is measured in characters and a :class:`bytes` A-label in
    octets.

    :param label: The label to check.
    :returns: ``True`` if the label is at most 63 units long, otherwise
        ``False``.
    """
    max_label_length = 63
    return len(label) <= max_label_length
|
||||
|
||||
|
||||
def valid_string_length(domain: Union[bytes, str], trailing_dot: bool) -> bool:
    """Tell whether a full domain name fits within the DNS name length limit.

    Per :rfc:`1035` the limit is 253 octets without a trailing dot and 254
    octets when the trailing dot is included. The length is taken with
    :func:`len`.

    :param domain: The full (possibly multi-label) domain name.
    :param trailing_dot: ``True`` if ``domain`` includes a trailing ``.``.
    :returns: ``True`` if the domain is within the applicable limit,
        otherwise ``False``.
    """
    if trailing_dot:
        limit = 254
    else:
        limit = 253
    return len(domain) <= limit
|
||||
|
||||
|
||||
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate the Bidi Rule from :rfc:`5893` for a single label.

    The Bidi Rule constrains how bidirectional characters (Hebrew, Arabic,
    etc.) may appear within a label. By default the check is only applied
    when the label contains at least one right-to-left character (Unicode
    bidirectional categories ``R``, ``AL``, or ``AN``); set ``check_ltr``
    to ``True`` to apply it to LTR-only labels as well.

    :param label: The label to validate, as a Unicode string.
    :param check_ltr: If ``True``, apply the rules even when the label
        contains no RTL characters.
    :returns: ``True`` if the label satisfies the Bidi Rule.
    :raises IDNABidiError: If any of Bidi Rule conditions 1-6 are violated,
        or if the directional category of a codepoint cannot be determined.
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    # Bidi rules should only be applied if string contains RTL characters
    bidi_label = False
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(f"Unknown directionality in label {label!r} at position {idx}")
        if direction in _bidi_rtl_categories:
            bidi_label = True
    if not bidi_label and not check_ltr:
        return True

    # Bidi rule 1
    # NOTE(review): label[0] assumes a non-empty label; an empty label only
    # reaches this line when check_ltr is True.
    direction = unicodedata.bidirectional(label[0])
    if direction in _bidi_rtl_first:
        rtl = True
    elif direction == "L":
        rtl = False
    else:
        raise IDNABidiError(f"First codepoint in label {label!r} must be directionality L, R or AL")

    # valid_ending tracks whether the most recent non-NSM codepoint had an
    # allowed ending direction; trailing NSM codepoints leave it unchanged.
    valid_ending = False
    number_type: Optional[str] = None
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)

        if rtl:
            # Bidi rule 2
            if direction not in _bidi_rtl_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a right-to-left label")
            # Bidi rule 3
            if direction in _bidi_rtl_valid_ending:
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False
            # Bidi rule 4
            if direction in _bidi_rtl_numeric:
                if not number_type:
                    # Remember the first numeral category seen (AN or EN)
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
        else:
            # Bidi rule 5
            if direction not in _bidi_ltr_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a left-to-right label")
            # Bidi rule 6
            if direction in _bidi_ltr_valid_ending:
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True
|
||||
|
||||
|
||||
def check_initial_combiner(label: str) -> bool:
    """Reject labels that begin with a combining mark.

    Per :rfc:`5891` §4.2.3.2 a label must not start with a character of
    Unicode general category ``M`` (Mark). An empty label has no first
    character and therefore passes, matching :func:`check_nfc` and
    :func:`check_bidi`, which also accept the empty string.

    :param label: The label to check.
    :returns: ``True`` if the first character is not a combining mark.
    :raises IDNAError: If the label begins with a combining character.
    """
    # Guard on emptiness first: indexing label[0] used to raise IndexError.
    if label and unicodedata.category(label[0]).startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True
|
||||
|
||||
|
||||
def check_hyphen_ok(label: str) -> bool:
    """Validate the hyphen restrictions for a label.

    Per :rfc:`5891` §4.2.3.1 a label must not start or end with a hyphen
    (``U+002D``), and must not have hyphens in both the third and fourth
    positions (the prefix reserved for A-labels). An empty label contains
    no hyphens and therefore passes, matching :func:`check_nfc` and
    :func:`check_bidi`, which also accept the empty string.

    :param label: The label to check.
    :returns: ``True`` if the hyphen restrictions are satisfied.
    :raises IDNAError: If any of the hyphen restrictions are violated.
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    # startswith/endswith are safe on "", where label[0] raised IndexError.
    if label.startswith("-") or label.endswith("-"):
        raise IDNAError("Label must not start or end with a hyphen")
    return True
|
||||
|
||||
|
||||
def check_nfc(label: str) -> None:
    """Ensure that a label is already in Unicode Normalization Form C.

    :param label: The label to check.
    :raises IDNAError: If ``label`` exceeds the defensive input length
        limit, or if it differs from its NFC normalisation.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    normalized = unicodedata.normalize("NFC", label)
    if normalized == label:
        return
    raise IDNAError("Label must be in Normalization Form C")
|
||||
|
||||
|
||||
def valid_contextj(label: str, pos: int) -> bool:
    """Validate the CONTEXTJ rules from :rfc:`5892` Appendix A.

    These rules govern the contextual use of the joiner codepoints
    ``U+200C`` (ZERO WIDTH NON-JOINER, Appendix A.1) and ``U+200D``
    (ZERO WIDTH JOINER, Appendix A.2) within a label.

    :param label: The label containing the codepoint.
    :param pos: Index of the joiner codepoint within ``label``.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its CONTEXTJ
        rule, ``False`` otherwise (including when the codepoint at
        ``pos`` is not a recognised joiner).
    :raises ValueError: If an adjacent codepoint has no Unicode name when
        determining its combining class.
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    cp_value = ord(label[pos])

    if cp_value == 0x200C:
        # Accepted outright when the preceding codepoint has the virama
        # combining class
        if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

        # Otherwise scan backwards: skip joining type "T", then require the
        # first other codepoint to have joining type L or D
        ok = False
        for i in range(pos - 1, -1, -1):
            joining_type = _joining_type(ord(label[i]))
            if joining_type == "T":
                continue
            if joining_type in _bidi_joiner_l_or_d:
                ok = True
                break
            # Any other joining type (or none) ends the scan unsuccessfully
            break

        if not ok:
            return False

        # Then scan forwards: skip joining type "T", then require the first
        # other codepoint to have joining type R or D
        ok = False
        for i in range(pos + 1, len(label)):
            joining_type = _joining_type(ord(label[i]))
            if joining_type == "T":
                continue
            if joining_type in _bidi_joiner_r_or_d:
                ok = True
                break
            # Any other joining type (or none) ends the scan unsuccessfully
            break
        return ok

    if cp_value == 0x200D:
        # Valid only directly after a codepoint with the virama combining class
        return pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class

    return False
|
||||
|
||||
|
||||
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Validate the CONTEXTO rules from :rfc:`5892` Appendix A.

    Handles MIDDLE DOT (``U+00B7``), ``U+0375``, ``U+05F3``/``U+05F4``,
    KATAKANA MIDDLE DOT (``U+30FB``) and the two digit ranges
    ``U+0660``-``U+0669`` and ``U+06F0``-``U+06F9``.

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :param exception: Reserved for forward compatibility; currently unused.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its CONTEXTO
        rule, ``False`` otherwise (including when the codepoint is not a
        recognised CONTEXTO codepoint).
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    codepoint = ord(label[pos])
    last_index = len(label) - 1

    if codepoint == 0x00B7:
        # Must sit between two U+006C characters
        if not 0 < pos < last_index:
            return False
        return label[pos - 1] == "\u006c" and label[pos + 1] == "\u006c"

    if codepoint == 0x0375:
        # The following character must be Greek
        has_follower = pos < last_index and len(label) > 1
        if not has_follower:
            return False
        return _is_script(label[pos + 1], "Greek")

    if codepoint in {0x05F3, 0x05F4}:
        # The preceding character must be Hebrew
        if pos <= 0:
            return False
        return _is_script(label[pos - 1], "Hebrew")

    if codepoint == 0x30FB:
        # Some other character in the label must be Hiragana, Katakana or Han
        return any(
            ch != "\u30fb" and (_is_script(ch, "Hiragana") or _is_script(ch, "Katakana") or _is_script(ch, "Han"))
            for ch in label
        )

    if 0x660 <= codepoint <= 0x669:
        # Must not share a label with digits from the other range
        for ch in label:
            if 0x6F0 <= ord(ch) <= 0x06F9:
                return False
        return True

    if 0x6F0 <= codepoint <= 0x6F9:
        # Must not share a label with digits from the other range
        for ch in label:
            if 0x660 <= ord(ch) <= 0x0669:
                return False
        return True

    return False
|
||||
|
||||
|
||||
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run the full set of IDNA 2008 validity checks on a single label.

    Applies, in order: NFC normalisation (:func:`check_nfc`), hyphen
    restrictions (:func:`check_hyphen_ok`), the no-leading-combiner rule
    (:func:`check_initial_combiner`), per-codepoint validity (PVALID,
    CONTEXTJ, CONTEXTO classes from :rfc:`5892`), and the Bidi Rule
    (:func:`check_bidi`).

    :param label: The label to validate. ``bytes`` or ``bytearray`` input
        is decoded as UTF-8 first.
    :raises IDNAError: If the label is empty, too long, fails a structural
        rule, or has a codepoint next to a joiner whose combining class
        cannot be determined.
    :raises InvalidCodepoint: If the label contains a codepoint that is in
        none of the PVALID, CONTEXTJ or CONTEXTO classes.
    :raises InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint
        is not valid in its context.
    :raises IDNABidiError: If the Bidi Rule is violated.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    # Reject on domain length rather than label length so support some UTS 46
    # use cases, still reducing processing of label contextual rules
    if not valid_string_length(label, trailing_dot=True):
        raise IDNAError("Label too long")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        if intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup belongs inside the try. InvalidCodepointContext
            # is itself a ValueError (via UnicodeError), so raising it inside
            # the try let the handler below swallow it and report a
            # misleading "Unknown codepoint" IDNAError instead.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    f"Unknown codepoint adjacent to joiner {_unot(cp_value)} at position {pos + 1} in {label!r}"
                ) from err
            if not joiner_ok:
                raise InvalidCodepointContext(f"Joiner {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(f"Codepoint {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(cp_value)} at position {pos + 1} of {label!r} not allowed")

    check_bidi(label)
|
||||
|
||||
|
||||
def alabel(label: str) -> bytes:
    """Produce the A-label form of a single U-label.

    A label made only of ASCII characters is validated through
    :func:`ulabel` and returned as :class:`bytes` without modification. Any
    other label is validated with :func:`check_label`, Punycode-encoded and
    prefixed with ``xn--`` (the ASCII-Compatible Encoding of :rfc:`5891` §4).

    :param label: The label to convert, as a Unicode string.
    :returns: The A-label as ASCII-encoded :class:`bytes`.
    :raises IDNAError: If the label is invalid or the resulting A-label
        exceeds 63 octets.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")

    if label.isascii():
        # Already ASCII: validate only, no Punycode step
        ace = label.encode("ascii")
        ulabel(ace)
    else:
        check_label(label)
        ace = _alabel_prefix + _punycode(label)

    if valid_label_length(ace):
        return ace
    raise IDNAError("Label too long")
|
||||
|
||||
|
||||
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single A-label into its U-label form.

    Performs the inverse of :func:`alabel`: an ``xn--``-prefixed label is
    Punycode-decoded and validated. Labels that are already Unicode (or
    plain ASCII without the ACE prefix) are validated and returned as a
    Unicode string.

    :param label: The label to convert. ``bytes`` or ``bytearray`` input
        is treated as ASCII.
    :returns: The U-label as a Unicode string. ASCII input without the ACE
        prefix is returned lower-cased.
    :raises IDNAError: If the label is malformed or fails validation.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    if not isinstance(label, (bytes, bytearray)):
        try:
            label_bytes = label.encode("ascii")
        except UnicodeEncodeError:
            # Non-ASCII text cannot be an A-label: validate and return as is
            check_label(label)
            return label
    else:
        label_bytes = bytes(label)

    # Lower-case first so the ACE prefix is matched case-insensitively
    label_bytes = label_bytes.lower()
    if label_bytes.startswith(_alabel_prefix):
        label_bytes = label_bytes[len(_alabel_prefix) :]
        if not label_bytes:
            raise IDNAError("Malformed A-label, no Punycode eligible content found")
        if label_bytes.endswith(b"-"):
            raise IDNAError("A-label must not end with a hyphen")
    else:
        # Plain ASCII label without the ACE prefix
        check_label(label_bytes)
        return label_bytes.decode("ascii")

    try:
        label = label_bytes.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(label)
    return label
|
||||
|
||||
|
||||
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Apply the UTS #46 character mapping to a domain string.

    Implements the mapping table from `UTS #46 §4
    <https://www.unicode.org/reports/tr46/>`_: each character is kept,
    replaced, or rejected based on its status (``V``, ``M``, ``D``, ``3``,
    ``I``). The result is returned in Normalisation Form C.

    :param domain: The full domain name to remap.
    :param std3_rules: If ``True``, apply the stricter STD3 ASCII rules
        (status ``3`` codepoints raise instead of being kept or mapped).
    :param transitional: If ``True``, use transitional processing (status
        ``D`` codepoints are mapped instead of kept). Transitional
        processing has been removed from UTS #46 and this option is
        retained only for backwards compatibility.
    :returns: The remapped domain, in Normalisation Form C.
    :raises InvalidCodepoint: If the domain contains a disallowed
        codepoint under the chosen rules.
    :raises IDNAError: If ``domain`` exceeds the defensive input length limit.
    """
    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    # Imported here rather than at module level, so the table is only
    # loaded when remapping is actually requested.
    from .uts46data import uts46_replacements, uts46_starts, uts46_statuses

    # Collect the output pieces and join once instead of growing a string
    # with ``+=`` on every character.
    pieces: list[str] = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # Code points below 256 index the tables directly; others are located
        # by bisecting the sorted range starts.
        i = code_point if code_point < 256 else bisect.bisect_right(uts46_starts, code_point) - 1
        status = chr(uts46_statuses[i])
        replacement: Optional[str] = uts46_replacements[i]

        # UTS #46 §4: V is always valid, D is deviation (kept unless transitional),
        # 3 is disallowed-STD3 (kept unmapped if std3_rules is off and no mapping).
        keep_as_is = (
            status == "V" or (status == "D" and not transitional) or (status == "3" and not std3_rules and replacement is None)
        )
        # M is mapped, 3-with-replacement and transitional D fall through to the
        # same replacement output path.
        use_replacement = replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        )

        if keep_as_is:
            pieces.append(char)
        elif use_replacement:
            assert replacement is not None  # narrowed by use_replacement
            pieces.append(replacement)
        elif status == "I":
            # Ignored: the character is dropped from the output
            continue
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(code_point)} not allowed at position {pos + 1} in {domain!r}")

    return unicodedata.normalize("NFC", "".join(pieces))
|
||||
|
||||
|
||||
def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Convert a Unicode domain name to its ASCII (A-label) representation.

    The input is split into labels (on ``U+002E`` only when ``strict`` is
    set; otherwise also on IDEOGRAPHIC FULL STOP ``U+3002``, FULLWIDTH FULL
    STOP ``U+FF0E``, and HALFWIDTH IDEOGRAPHIC FULL STOP ``U+FF61``), each
    label is passed through :func:`alabel`, and the results are joined
    with ``.``. A trailing separator in the input is preserved as a
    trailing ``.`` in the output.

    :param s: The domain name to encode. Byte input must be pure ASCII.
    :param strict: If ``True``, only ``U+002E`` separates labels.
    :param uts46: If ``True``, run :func:`uts46_remap` on the input first.
    :param std3_rules: Passed to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param transitional: Passed to :func:`uts46_remap` when ``uts46`` is
        ``True``. Deprecated: a :class:`DeprecationWarning` is emitted
        whenever it is set.
    :returns: The encoded domain as ASCII :class:`bytes`.
    :raises IDNAError: If the domain is empty, contains an empty or
        invalid label, or is too long.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Normalise the input to ``str``; bytes-like input must decode as ASCII.
    if isinstance(s, str):
        domain = s
    else:
        try:
            domain = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.") from err

    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    if uts46:
        domain = uts46_remap(domain, std3_rules, transitional)

    # Cheap length rejection before any per-label work is done.
    if not valid_string_length(domain, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = domain.split(".") if strict else _unicode_dots_re.split(domain)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final label means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        # Re-create the trailing "." when the labels are joined.
        encoded_labels.append(b"")

    encoded_domain = b".".join(encoded_labels)
    if not valid_string_length(encoded_domain, trailing_dot):
        raise IDNAError("Domain too long")
    return encoded_domain
|
||||
|
||||
|
||||
def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    display: bool = False,
) -> str:
    """Convert an A-label-encoded domain name back to Unicode.

    The input is split into labels (same separator rules as
    :func:`encode`), each label is passed through :func:`ulabel`, and the
    results are joined with ``.``. A trailing separator in the input is
    preserved as a trailing ``.`` in the output.

    :param s: The domain name to decode. Byte input must be pure ASCII.
    :param strict: If ``True``, only ``U+002E`` separates labels.
    :param uts46: If ``True``, run :func:`uts46_remap` (non-transitional)
        on the input first.
    :param std3_rules: Passed to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param display: If ``True``, a label starting with ``xn--`` (in any
        case) that :func:`ulabel` rejects is emitted lowercased instead
        of aborting the whole call. Meant for "decode for display"
        consumers that prefer to show the on-the-wire label when it
        cannot be rendered as Unicode; other failing labels still raise.
    :returns: The decoded domain as a Unicode string.
    :raises IDNAError: If byte input is not valid ASCII, the domain is
        empty or too long, or a label is empty or invalid.
    """
    # Normalise the input to ``str``; bytes-like input must decode as ASCII.
    if isinstance(s, str):
        domain = s
    else:
        try:
            domain = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("Invalid ASCII in A-label") from err

    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    if uts46:
        domain = uts46_remap(domain, std3_rules, False)

    # Cheap length rejection before any per-label work is done.
    if not valid_string_length(domain, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = domain.split(".") if strict else _unicode_dots_re.split(domain)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final label means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        try:
            decoded = ulabel(label)
        except IDNAError:
            # Only "xn--" labels are recoverable, and only in display mode.
            if not (display and label[:4].lower() == "xn--"):
                raise
            decoded = label.lower()
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        # Re-create the trailing "." when the labels are joined.
        decoded_labels.append("")

    return ".".join(decoded_labels)
|
||||
1897
venv/lib/python3.12/site-packages/idna/idnadata.py
Normal file
1897
venv/lib/python3.12/site-packages/idna/idnadata.py
Normal file
File diff suppressed because it is too large
Load Diff
55
venv/lib/python3.12/site-packages/idna/intranges.py
Normal file
55
venv/lib/python3.12/site-packages/idna/intranges.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Given a list of integers, made up of (hopefully) a small number of long runs
|
||||
of consecutive integers, compute a representation of the form
|
||||
((start1, end1), (start2, end2) ...). Then answer the question "was x present
|
||||
in the original list?" in time O(log(# runs)).
|
||||
"""
|
||||
|
||||
import bisect
|
||||
|
||||
|
||||
def intranges_from_list(list_: list[int]) -> tuple[int, ...]:
    """Represent a list of integers as a sequence of ranges:
    ((start_0, end_0), (start_1, end_1), ...), such that the original
    integers are exactly those x such that start_i <= x < end_i for some i.

    Ranges are encoded as single integers (start << 32 | end), not as tuples.

    Duplicate values in ``list_`` are ignored, so the returned ranges never
    overlap. An empty input yields an empty tuple.
    """
    # Deduplicate before scanning: a repeated value would otherwise end a
    # run early and produce redundant, overlapping ranges.
    values = sorted(set(list_))
    ranges = []
    run_start = 0  # index in ``values`` where the current run begins
    for i, value in enumerate(values):
        # Still inside a run of consecutive integers: keep extending it.
        if i + 1 < len(values) and value == values[i + 1] - 1:
            continue
        # ``value`` closes the run; the stored end bound is exclusive.
        ranges.append(_encode_range(values[run_start], value + 1))
        run_start = i + 1

    return tuple(ranges)
|
||||
|
||||
|
||||
def _encode_range(start: int, end: int) -> int:
|
||||
return (start << 32) | end
|
||||
|
||||
|
||||
def _decode_range(r: int) -> tuple[int, int]:
|
||||
return (r >> 32), (r & ((1 << 32) - 1))
|
||||
|
||||
|
||||
def intranges_contain(int_: int, ranges: tuple[int, ...]) -> bool:
    """Determine if `int_` falls into one of the ranges in `ranges`.

    `ranges` holds packed ``start << 32 | end`` integers and is searched
    with :mod:`bisect`.
    """
    # A probe with end == 0 lands at the first entry whose encoded value
    # is >= (int_ << 32).
    probe = _encode_range(int_, 0)
    idx = bisect.bisect_left(ranges, probe)

    # The entry just before the insertion point may be a range
    # (start, end) that covers int_, i.e. start <= int_ < end.
    if idx > 0:
        start, end = _decode_range(ranges[idx - 1])
        if start <= int_ < end:
            return True

    # Otherwise the entry at the insertion point may be a range that
    # begins exactly at int_.
    if idx < len(ranges):
        start, _ = _decode_range(ranges[idx])
        if start == int_:
            return True

    return False
|
||||
1
venv/lib/python3.12/site-packages/idna/package_data.py
Normal file
1
venv/lib/python3.12/site-packages/idna/package_data.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "3.18"
|
||||
0
venv/lib/python3.12/site-packages/idna/py.typed
Normal file
0
venv/lib/python3.12/site-packages/idna/py.typed
Normal file
16896
venv/lib/python3.12/site-packages/idna/uts46data.py
Normal file
16896
venv/lib/python3.12/site-packages/idna/uts46data.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user