# Source code for pystrich.datamatrix.data

"""DataMatrix-specific composition types and marker constants."""

from __future__ import annotations

import warnings
from typing import Literal

from pystrich.exceptions import (
    DataMatrixNonAsciiWarning,
    Fnc1WorkaroundCompatWarning,
    PyStrichInvalidInput,
    PyStrichInvalidOption,
)

# Names accepted for the ``encoding=`` option of DataMatrixData.
DataMatrixEncoding = Literal["compat", "ascii", "iso-8859-1", "utf-8"]

# Validation table consulted by DataMatrixData.__init__ when an explicit
# encoding is given: text whose highest codepoint exceeds the limit triggers
# the listed policy.
_ENCODING_RULES: dict[DataMatrixEncoding, tuple[str, int, Literal["warn", "raise"]]] = {
    # encoding name -> (Python codec name, max representable codepoint, on-fail policy)
    # Policy "warn" emits DataMatrixNonAsciiWarning and construction continues;
    # policy "raise" raises PyStrichInvalidInput.
    "compat": ("ascii", 0x7F, "warn"),
    "ascii": ("ascii", 0x7F, "raise"),
    "iso-8859-1": ("iso-8859-1", 0xFF, "raise"),
    "utf-8": ("utf-8", 0x10FFFF, "raise"),
}

# Candidates tried in this order (narrowest first) during automatic selection.
# "compat" is deliberately absent, so it is never chosen automatically.
_AUTO_ENCODING_ORDER: tuple[DataMatrixEncoding, ...] = ("ascii", "iso-8859-1", "utf-8")


def get_suitable_encoding_for_codepoint(codepoint: int) -> DataMatrixEncoding:
    """Return the narrowest encoding from the auto-selection order that fits ``codepoint``.

    Candidates are tried in ``_AUTO_ENCODING_ORDER`` and the first one whose
    maximum representable codepoint (from ``_ENCODING_RULES``) is at least
    ``codepoint`` is returned.

    :param codepoint: Unicode codepoint, e.g. the result of ``ord()``.
    :returns: The name of the first encoding able to represent ``codepoint``.
    :raises ValueError: If no candidate can represent ``codepoint`` (it is
        larger than the widest encoding's limit).
    """
    for candidate in _AUTO_ENCODING_ORDER:
        if codepoint <= _ENCODING_RULES[candidate][1]:
            return candidate
    # An exhausted ``next()`` would leak a bare StopIteration here, which is
    # confusing for callers and turns into RuntimeError inside generators.
    raise ValueError(
        f"codepoint {codepoint!r} exceeds the range of every supported encoding"
    )



# [docs]
class DataMatrixData:
    """Encoder input made of text chunks interleaved with raw-codeword markers.

    Values are normally built by concatenation: place a marker constant such
    as :data:`FNC1` next to plain strings (on either side) and hand the result
    to :class:`DataMatrixEncoder` wherever a ``str`` would be accepted.

    The constructor must be told how text is encoded. Either pass
    ``encoding=`` (``"ascii"``, ``"iso-8859-1"``, ``"utf-8"`` or the legacy
    ``"compat"``), or pass ``auto_encoding=True`` to have the narrowest of
    ``ascii``, ``iso-8859-1``, ``utf-8`` that can represent every segment
    selected automatically. With ``auto_encoding=True`` a valid ``encoding=``
    passed alongside does not influence the selection (an unknown name is
    still rejected). After construction, :attr:`encoding` is always one of
    the four concrete charsets.

    :param segments: Text chunks and :class:`DataMatrixCodeword` markers, in
        output order.
    :param encoding: Explicit encoding name, or ``None`` when relying on
        ``auto_encoding``.
    :param auto_encoding: Select the encoding from the segment contents.
    :raises PyStrichInvalidOption: If neither ``encoding`` nor
        ``auto_encoding`` is given, or ``encoding`` is not a known name.
    :raises TypeError: If a segment is neither ``str`` nor
        :class:`DataMatrixCodeword`.
    :raises PyStrichInvalidInput: If an explicit strict encoding cannot
        represent the text. The legacy ``"compat"`` encoding emits
        ``DataMatrixNonAsciiWarning`` instead of raising.

    .. versionadded:: 0.11

    .. versionchanged:: 0.12
       Callers must now pass either an explicit ``encoding=`` or
       ``auto_encoding=True``. Added the ``auto_encoding`` flag.

    .. deprecated:: 0.11
       The ``"compat"`` encoding is retained only for backwards
       compatibility and will be removed in a future release. New code
       should pick ``"ascii"``, ``"iso-8859-1"`` or ``"utf-8"`` explicitly,
       or use ``auto_encoding=True``.
    """

    __slots__ = ("auto_encoding", "encoding", "segments")

    segments: tuple[str | DataMatrixCodeword, ...]
    encoding: DataMatrixEncoding
    auto_encoding: bool

    def __init__(
        self,
        *segments: str | DataMatrixCodeword,
        encoding: DataMatrixEncoding | None = None,
        auto_encoding: bool = False,
    ) -> None:
        # Option validation: an encoding must be named unless auto-selection
        # was requested, and a named encoding must be a known one.
        if encoding is None:
            if not auto_encoding:
                raise PyStrichInvalidOption(
                    "DataMatrixData requires an explicit encoding= "
                    "(one of 'ascii', 'iso-8859-1', 'utf-8', 'compat') "
                    "or auto_encoding=True for automatic selection."
                )
        elif encoding not in _ENCODING_RULES:
            raise PyStrichInvalidOption(
                f"unknown DataMatrixData encoding {encoding!r}; "
                f"expected one of {sorted(_ENCODING_RULES)}"
            )

        # Single pass over the segments: reject foreign types and remember the
        # largest codepoint found in any text chunk (codewords carry no text).
        highest = 0
        for part in segments:
            if isinstance(part, DataMatrixCodeword):
                continue
            if not isinstance(part, str):
                raise TypeError(
                    f"DataMatrixData segments must be str or DataMatrixCodeword, "
                    f"got {type(part).__name__}"
                )
            highest = max(highest, max(map(ord, part), default=0))

        if auto_encoding:
            resolved = get_suitable_encoding_for_codepoint(highest)
        else:
            assert encoding is not None  # the option validation above ensures this
            resolved = encoding
            codec, limit, policy = _ENCODING_RULES[encoding]
            if highest > limit:
                rendered_segments = ", ".join(map(repr, segments))
                alternative = get_suitable_encoding_for_codepoint(highest)
                problem = (
                    f"DataMatrix encoding {encoding!r} expects {codec.upper()}; "
                    f"got {chr(highest)!r}. "
                    f"Try {type(self).__name__}({rendered_segments}, encoding={alternative!r})"
                    " or pass auto_encoding=True to select an encoding automatically."
                )
                if policy == "raise":
                    raise PyStrichInvalidInput(problem)
                # Legacy "compat" policy: warn and keep going.
                warnings.warn(
                    problem + " Promote to error with "
                    "warnings.filterwarnings('error', category=PyStrichWarning).",
                    DataMatrixNonAsciiWarning,
                    stacklevel=2,
                )

        self.segments = segments
        self.encoding = resolved
        self.auto_encoding = auto_encoding

    def __add__(self, other):
        """Return a new value with ``other`` appended.

        ``other`` may be a ``str``, a :class:`DataMatrixCodeword` or another
        :class:`DataMatrixData`. Two values with different encodings can only
        be joined when at least one of them uses automatic selection; the
        result then uses automatic selection too.
        """
        if isinstance(other, DataMatrixData):
            both_fixed = not self.auto_encoding and not other.auto_encoding
            if both_fixed and other.encoding != self.encoding:
                raise PyStrichInvalidOption(
                    f"cannot concatenate DataMatrixData with different encodings "
                    f"({self.encoding!r} and {other.encoding!r})"
                )
            joined = (*self.segments, *other.segments)
            rhs_auto = other.auto_encoding
        elif isinstance(other, (str, DataMatrixCodeword)):
            joined = (*self.segments, other)
            rhs_auto = False
        else:
            return NotImplemented

        cls = type(self)
        return cls(
            *joined,
            encoding=self.encoding,
            auto_encoding=self.auto_encoding or rhs_auto,
        )

    def __radd__(self, other):
        """Return a new value with the string ``other`` prepended."""
        if isinstance(other, str):
            cls = type(self)
            return cls(
                other,
                *self.segments,
                encoding=self.encoding,
                auto_encoding=self.auto_encoding,
            )
        return NotImplemented

    def __eq__(self, other):
        """Compare by segments and encoding; only same-type objects are comparable."""
        if type(other) is type(self):
            return self.segments == other.segments and self.encoding == other.encoding
        return NotImplemented

    def __hash__(self):
        """Hash over the same fields that ``__eq__`` compares, plus the type."""
        key = (type(self), self.segments, self.encoding)
        return hash(key)

    def __repr__(self):
        """Show the class name followed by the segments rendered as a list."""
        class_name = type(self).__name__
        listed = list(self.segments)
        return f"{class_name}({listed!r})"




# [docs]
class DataMatrixCodeword:
    """A literal DataMatrix codeword value to emit verbatim.

    Concatenation with a plain ``str`` or another codeword (e.g. ``FNC1 + "..."``)
    is the modern API path and produces a :class:`DataMatrixData` tagged with
    the strict ``"ascii"`` encoding. Concatenation with an existing
    :class:`DataMatrixData` preserves that object's encoding and its
    ``auto_encoding`` flag instead.

    :param value: Codeword value in the inclusive range 0-255.
    :raises ValueError: If ``value`` lies outside that range.

    .. versionadded:: 0.11
    """

    __slots__ = ("value",)

    value: int

    def __init__(self, value: int) -> None:
        if not 0 <= value <= 255:
            raise ValueError(f"codeword must be 0-255, got {value}")
        self.value = value

    def __add__(self, other):
        """Return a :class:`DataMatrixData` with this codeword placed before ``other``."""
        if isinstance(other, DataMatrixData):
            # Forward auto_encoding as well as encoding: otherwise
            # ``FNC1 + data`` would silently turn an auto-selecting value into
            # a fixed-encoding one, while ``data + FNC1`` keeps it automatic.
            return DataMatrixData(
                self,
                *other.segments,
                encoding=other.encoding,
                auto_encoding=other.auto_encoding,
            )
        if isinstance(other, (str, DataMatrixCodeword)):
            return DataMatrixData(self, other, encoding="ascii")
        return NotImplemented

    def __radd__(self, other):
        """Return a :class:`DataMatrixData` with the string ``other`` placed before this codeword."""
        if isinstance(other, str):
            return DataMatrixData(other, self, encoding="ascii")
        return NotImplemented

    def __eq__(self, other: object) -> bool:
        """Compare by value; only objects of exactly the same type are comparable."""
        if type(self) is not type(other):
            return NotImplemented
        return self.value == other.value

    def __hash__(self) -> int:
        """Hash over the type and the value, matching ``__eq__``."""
        return hash((type(self), self.value))

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.value})"



# FNC1 marker: a DataMatrixCodeword carrying the value 232, for use in
# concatenations such as ``FNC1 + "..."``.
# Background: https://github.com/mmulqueen/pyStrich/issues/13
FNC1 = DataMatrixCodeword(232)


def fnc1_workaround_compat(text: str, /) -> DataMatrixData:
    """Translate legacy chr(231) markers into explicit FNC1 codewords.

    This predates the FNC1 constant: callers used to trigger codeword 232 via
    the +1 ASCII offset bug. New code should use the FNC1 constant directly.

    The translation is keyed on the start of ``text``:

    * If ``text`` does not begin with chr(231), it is wrapped unchanged in a
      ``"compat"``-encoded :class:`DataMatrixData`; any chr(231) further in is
      left alone and falls through to the compat-mode warning.
    * If ``text`` begins with chr(231), a ``Fnc1WorkaroundCompatWarning`` is
      emitted and every chr(231) in the text (the leading one and any later
      ones) is replaced by FNC1. Empty text between markers is dropped.

    See https://github.com/mmulqueen/pyStrich/issues/13.
    """
    legacy_marker = "\xe7"

    if text.startswith(legacy_marker):
        warnings.warn(
            "chr(231) is being interpreted as FNC1 (codeword 232) for backwards "
            "compatibility with issue #13. New code should use the FNC1 constant "
            "from pystrich.datamatrix instead.",
            Fnc1WorkaroundCompatWarning,
            stacklevel=2,
        )

        pieces: list[str | DataMatrixCodeword] = []
        # The text starts with the marker, so the first split chunk is always
        # empty; every remaining chunk was preceded by one marker.
        for tail in text.split(legacy_marker)[1:]:
            pieces.append(FNC1)
            if tail:
                pieces.append(tail)
        return DataMatrixData(*pieces, encoding="compat")

    return DataMatrixData(text, encoding="compat")