"""DataMatrix-specific composition types and marker constants."""
from __future__ import annotations
import warnings
from typing import Literal
from pystrich.exceptions import (
DataMatrixNonAsciiWarning,
Fnc1WorkaroundCompatWarning,
PyStrichInvalidInput,
PyStrichInvalidOption,
)
# Names accepted for the ``encoding=`` argument of DataMatrixData.
DataMatrixEncoding = Literal["compat", "ascii", "iso-8859-1", "utf-8"]

# Per-encoding rule table. Each value is a 3-tuple:
#   (Python codec name, max representable codepoint, on-fail policy)
# where the policy says whether text exceeding the maximum codepoint
# produces a warning ("warn") or an exception ("raise").
_ENCODING_RULES: dict[DataMatrixEncoding, tuple[str, int, Literal["warn", "raise"]]] = {
    "compat": ("ascii", 0x7F, "warn"),
    "ascii": ("ascii", 0x7F, "raise"),
    "iso-8859-1": ("iso-8859-1", 0xFF, "raise"),
    "utf-8": ("utf-8", 0x10FFFF, "raise"),
}

# Candidates for automatic selection, ordered narrowest first; "compat" is
# deliberately absent so it is never picked automatically.
_AUTO_ENCODING_ORDER: tuple[DataMatrixEncoding, ...] = (
    "ascii",
    "iso-8859-1",
    "utf-8",
)
def get_suitable_encoding_for_codepoint(codepoint: int) -> DataMatrixEncoding:
    """Return the narrowest encoding from the auto-selection order that fits ``codepoint``.

    Candidates are tried in ``_AUTO_ENCODING_ORDER``; the first one whose
    maximum representable codepoint (second field of its ``_ENCODING_RULES``
    entry) is at least ``codepoint`` is returned.

    :param codepoint: Codepoint to accommodate, e.g. the result of :func:`ord`.
    :returns: Name of the first fitting encoding.
    :raises ValueError: If no candidate encoding can represent ``codepoint``.
    """
    for candidate in _AUTO_ENCODING_ORDER:
        if codepoint <= _ENCODING_RULES[candidate][1]:
            return candidate
    # Raise a real error instead of letting an exhausted ``next()`` leak
    # StopIteration, which can silently terminate a caller's iteration.
    raise ValueError(
        f"no DataMatrix encoding can represent codepoint {codepoint!r}"
    )
[docs]
class DataMatrixData:
    """Encoder input made of text chunks interleaved with raw-codeword markers.

    Values are normally built by concatenation: put a marker constant
    (e.g. :data:`FNC1`) next to plain strings on either side, then hand the
    result to :class:`DataMatrixEncoder` in place of a ``str``.

    The constructor needs to know how text is to be encoded. Pass either an
    explicit ``encoding=`` (``"ascii"``, ``"iso-8859-1"``, ``"utf-8"`` or the
    legacy ``"compat"``) or ``auto_encoding=True``. In automatic mode the
    narrowest of ``ascii``, ``iso-8859-1``, ``utf-8`` able to represent every
    segment is selected, and any ``encoding=`` given alongside is ignored.
    Once constructed, :attr:`encoding` always holds one of the four concrete
    charsets.

    .. versionadded:: 0.11

    .. versionchanged:: 0.12
       Callers must now pass either an explicit ``encoding=`` or
       ``auto_encoding=True``. Added the ``auto_encoding`` flag.

    .. deprecated:: 0.11
       The ``"compat"`` encoding is retained only for backwards
       compatibility and will be removed in a future release. New code
       should pick ``"ascii"``, ``"iso-8859-1"`` or ``"utf-8"`` explicitly,
       or use ``auto_encoding=True``.
    """

    __slots__ = ("auto_encoding", "encoding", "segments")

    segments: tuple[str | DataMatrixCodeword, ...]
    encoding: DataMatrixEncoding
    auto_encoding: bool

    def __init__(
        self,
        *segments: str | DataMatrixCodeword,
        encoding: DataMatrixEncoding | None = None,
        auto_encoding: bool = False,
    ) -> None:
        """Validate the segments and settle on the encoding to store.

        :param segments: Text chunks and codeword markers, in output order.
        :param encoding: Explicit encoding name; required unless
            ``auto_encoding`` is true.
        :param auto_encoding: Select the narrowest fitting encoding instead
            of using ``encoding``.
        :raises PyStrichInvalidOption: If neither option is given, or the
            encoding name is unknown.
        :raises TypeError: If a segment is neither ``str`` nor
            :class:`DataMatrixCodeword`.
        :raises PyStrichInvalidInput: If the text does not fit an explicit
            encoding whose policy is ``"raise"``.
        """
        if encoding is None and not auto_encoding:
            raise PyStrichInvalidOption(
                "DataMatrixData requires an explicit encoding= "
                "(one of 'ascii', 'iso-8859-1', 'utf-8', 'compat') "
                "or auto_encoding=True for automatic selection."
            )
        if not (encoding is None or encoding in _ENCODING_RULES):
            raise PyStrichInvalidOption(
                f"unknown DataMatrixData encoding {encoding!r}; "
                f"expected one of {sorted(_ENCODING_RULES)}"
            )

        # Single pass: reject foreign segment types and track the largest
        # codepoint seen in any text chunk (codeword markers carry no text).
        highest = 0
        for item in segments:
            if isinstance(item, DataMatrixCodeword):
                continue
            if not isinstance(item, str):
                raise TypeError(
                    f"DataMatrixData segments must be str or DataMatrixCodeword, "
                    f"got {type(item).__name__}"
                )
            if item:
                highest = max(highest, ord(max(item)))

        if auto_encoding:
            resolved = get_suitable_encoding_for_codepoint(highest)
        else:
            assert encoding is not None  # guaranteed by the early None+!auto check
            resolved = encoding
            codec, limit, policy = _ENCODING_RULES[encoding]
            if highest > limit:
                better = get_suitable_encoding_for_codepoint(highest)
                shown_args = ", ".join([repr(part) for part in segments])
                msg = (
                    f"DataMatrix encoding {encoding!r} expects {codec.upper()}; "
                    f"got {chr(highest)!r}. "
                    f"Try {type(self).__name__}({shown_args}, encoding={better!r})"
                    " or pass auto_encoding=True to select an encoding automatically."
                )
                if policy == "raise":
                    raise PyStrichInvalidInput(msg)
                # Policy "warn" (legacy compat mode): report and carry on.
                warnings.warn(
                    msg + " Promote to error with "
                    "warnings.filterwarnings('error', category=PyStrichWarning).",
                    DataMatrixNonAsciiWarning,
                    stacklevel=2,
                )

        self.segments = segments
        self.encoding = resolved
        self.auto_encoding = auto_encoding

    def __add__(self, other):
        """Return a new instance with ``other`` appended on the right.

        ``other`` may be a ``str``, a :class:`DataMatrixCodeword` or another
        :class:`DataMatrixData`. Two explicitly-encoded operands must agree
        on their encoding; the result is automatic if either operand is.
        """
        if isinstance(other, (str, DataMatrixCodeword)):
            appended = (other,)
            other_is_auto = False
        elif isinstance(other, DataMatrixData):
            both_explicit = not self.auto_encoding and not other.auto_encoding
            if both_explicit and self.encoding != other.encoding:
                raise PyStrichInvalidOption(
                    f"cannot concatenate DataMatrixData with different encodings "
                    f"({self.encoding!r} and {other.encoding!r})"
                )
            appended = other.segments
            other_is_auto = other.auto_encoding
        else:
            return NotImplemented
        return type(self)(
            *self.segments,
            *appended,
            encoding=self.encoding,
            auto_encoding=self.auto_encoding or other_is_auto,
        )

    def __radd__(self, other):
        """Return a new instance with the ``str`` ``other`` prepended."""
        if isinstance(other, str):
            return type(self)(
                other,
                *self.segments,
                encoding=self.encoding,
                auto_encoding=self.auto_encoding,
            )
        return NotImplemented

    def __eq__(self, other):
        """Compare segments and encoding; only same-type objects can be equal."""
        if type(other) is not type(self):
            return NotImplemented
        if self.segments != other.segments:
            return False
        return self.encoding == other.encoding

    def __hash__(self):
        """Hash over the same fields that ``__eq__`` compares."""
        key = (type(self), self.segments, self.encoding)
        return hash(key)

    def __repr__(self):
        """Show the class name followed by the segments as a list."""
        shown = list(self.segments)
        return f"{type(self).__name__}({shown!r})"
[docs]
class DataMatrixCodeword:
    """A literal DataMatrix codeword value to emit verbatim.

    Concatenation with a plain ``str`` or another codeword (e.g. ``FNC1 + "..."``)
    is the modern API path and produces a :class:`DataMatrixData` tagged with
    the strict ``"ascii"`` encoding. Concatenation with an existing
    :class:`DataMatrixData` preserves that object's encoding and its
    ``auto_encoding`` flag instead.

    .. versionadded:: 0.11
    """

    __slots__ = ("value",)

    value: int

    def __init__(self, value: int) -> None:
        """Store ``value`` after checking it lies in the range 0-255.

        :raises ValueError: If ``value`` is outside 0-255.
        """
        if not 0 <= value <= 255:
            raise ValueError(f"codeword must be 0-255, got {value}")
        self.value = value

    def __add__(self, other):
        """Return a :class:`DataMatrixData` starting with this codeword."""
        if isinstance(other, DataMatrixData):
            # Forward the auto_encoding flag as well as the encoding, so that
            # ``codeword + data`` behaves like ``"text" + data`` and later
            # concatenations keep re-selecting the encoding automatically.
            return DataMatrixData(
                self,
                *other.segments,
                encoding=other.encoding,
                auto_encoding=other.auto_encoding,
            )
        if isinstance(other, (str, DataMatrixCodeword)):
            return DataMatrixData(self, other, encoding="ascii")
        return NotImplemented

    def __radd__(self, other):
        """Return a :class:`DataMatrixData` for ``str + codeword``."""
        if isinstance(other, str):
            return DataMatrixData(other, self, encoding="ascii")
        return NotImplemented

    def __eq__(self, other):
        """Codewords are equal when they share exact type and value."""
        if type(self) is not type(other):
            return NotImplemented
        return self.value == other.value

    def __hash__(self):
        """Hash over the same fields that ``__eq__`` compares."""
        return hash((type(self), self.value))

    def __repr__(self):
        """Show the class name and the codeword value."""
        return f"{type(self).__name__}({self.value})"
# Marker for the FNC1 function character, emitted as raw codeword 232.
# Background: https://github.com/mmulqueen/pyStrich/issues/13
FNC1 = DataMatrixCodeword(value=232)
def fnc1_workaround_compat(text: str, /) -> DataMatrixData:
    """Translate a leading chr(231) into an explicit FNC1 marker.

    Predates the FNC1 constant: callers triggered codeword 232 via the +1 ASCII
    offset bug. Non-leading chr(231) is left alone and falls through to the
    compat-mode warning. New code should use the FNC1 constant directly.

    See https://github.com/mmulqueen/pyStrich/issues/13.

    :param text: Legacy input, possibly starting with chr(231).
    :returns: A ``"compat"``-encoded :class:`DataMatrixData`. When ``text``
        starts with chr(231) the first segment is :data:`FNC1` and the rest
        of the text (if any) follows unchanged; otherwise ``text`` is the
        only segment.
    """
    legacy_marker = "\xe7"
    if not text.startswith(legacy_marker):
        return DataMatrixData(text, encoding="compat")
    warnings.warn(
        "chr(231) is being interpreted as FNC1 (codeword 232) for backwards "
        "compatibility with issue #13. New code should use the FNC1 constant "
        "from pystrich.datamatrix instead.",
        Fnc1WorkaroundCompatWarning,
        stacklevel=2,
    )
    # Strip only the leading marker. Later chr(231) characters stay in the
    # text so DataMatrixData's compat handling reports them, rather than
    # being silently turned into extra FNC1 codewords.
    remainder = text[len(legacy_marker):]
    if not remainder:
        return DataMatrixData(FNC1, encoding="compat")
    return DataMatrixData(FNC1, remainder, encoding="compat")