"""Main parsing and formatting functionality."""
from __future__ import annotations
import re
import datetime as dt
import random
import string
from functools import lru_cache
import typing
if typing.TYPE_CHECKING:
from _typeshed import StrOrLiteralStr
from typing import Any
from collections.abc import Iterable, Sequence, Mapping
[docs]
class Parser:
    """Object-oriented wrapper around the module-level parsing functions.

    A ``Parser`` stores one format string in ``self.fmt`` and forwards every
    operation to the module-level function of the same name.
    """

    def __init__(self, fmt: str):
        self.fmt = fmt

    def __str__(self):
        return self.fmt

    def keys(self):
        """Return the parameter names declared in the format string."""
        return get_convert_dict(self.fmt).keys()

    def parse(self, stri: str, full_match: bool = True) -> dict[str, Any]:
        """Extract keys and values from ``stri`` according to this parser's format.

        Args:
            stri: String to extract information from.
            full_match: Force the match of the whole string (Default value = True).

        Returns:
            Dictionary mapping each parameter name to its converted value.
        """
        return parse(fmt=self.fmt, stri=stri, full_match=full_match)

    def compose(self, keyvals: Mapping[str, Any], allow_partial: bool = False) -> str:
        """Fill the format string ``self.fmt`` with the values in ``keyvals``.

        Args:
            keyvals: "Parameter --> parameter value" map.
            allow_partial: If True, parameters of the format string that are
                missing from ``keyvals`` are left in the result unchanged
                instead of being required (Default value = False).

        Returns:
            The string obtained by formatting ``self.fmt`` with ``keyvals``.
        """
        return compose(self.fmt, keyvals, allow_partial=allow_partial)

    # ``format`` is an alias of ``compose``.
    format = compose

    def globify(self, keyvals: Mapping[str, Any] | None = None) -> str:
        """Build a pattern usable with glob.glob() from the format string.

        Args:
            keyvals: Optional "Parameter --> parameter value" map.
        """
        return globify(fmt=self.fmt, keyvals=keyvals)

    def validate(self, stri: str) -> bool:
        """Tell whether ``stri`` can be parsed with this parser's format.

        Handy for filtering strings, or for checking that a string is
        compatible before handing it to the parse function.
        """
        return validate(fmt=self.fmt, stri=stri)

    def is_one2one(self):
        """Check if this parser's format string has a one to one correspondence.

        One to one correspondence means that composing data into a string
        and parsing that string again gives back the original data, with no
        change or loss of information.

        Note: This test only applies to sensible usage of the format string.
        If string or numeric data causes overflow, e.g. when composing
        "abcd" into ``{3s}``, the correspondence is always broken. The same
        holds for precision losses when using datetime data.
        """
        return is_one2one(fmt=self.fmt)
# Module-wide formatter instance. Its ``parse`` method is used to split format
# strings into fields and its ``format`` method to compose strings.
# NOTE(review): StringFormatter is defined outside this view.
formatter = StringFormatter()
# Regular expression fragment for each format type, keyed by type character.
# taken from https://docs.python.org/3/library/re.html#simulating-scanf
spec_regexes = {
    "b": r"[-+]?[0-1]",
    "c": r".",
    "d": r"[-+]?\d",
    # Naive fixed point format specifier (e.g. {foo:f})
    "f": r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?",
    # Fixed point format specifier including width and precision
    # (e.g. {foo:4.2f}). The lookahead (?=.{width}) makes sure that the
    # subsequent pattern is only matched if the string has the required
    # (minimum) width. This entry is a template: ``width`` and ``decimals``
    # are filled in with ``str.format``.
    "f_with_precision": r"(?=.{{{width}}})([-+]?([\d ]+(\.\d{{{decimals}}})+|\.\d{{{decimals}}})([eE][-+]?\d+)?)",
    "i": r"[-+]?(0[xX][\dA-Fa-f]+|0[0-7]*|\d+)",
    "o": r"[-+]?[0-7]",
    "s": r"\S",
    "x": r"[-+]?(0[xX])?[\dA-Fa-f]",
}
# Types that reuse the pattern of another type; the empty type behaves like "s".
spec_regexes.update(
    {
        "e": spec_regexes["f"],
        "E": spec_regexes["f"],
        "g": spec_regexes["f"],
        "X": spec_regexes["x"],
        "": spec_regexes["s"],
    }
)

# Types whose pattern above describes a single character.
# NOTE(review): presumably repeated by the regex builder, which is not visible here.
allow_multiple = ["b", "c", "d", "o", "s", "", "x", "X"]
# Types that are converted to float when parsing.
fixed_point_types = ["f", "e", "E", "g"]
# Regular expression for Python's format specification mini-language:
# format_spec ::= [[fill]align][sign][#][0][width][,][.precision][type]
# https://docs.python.org/3.4/library/string.html#format-specification-mini-language
# Every group is optional, so ``match`` always succeeds (possibly with an
# empty match). The precision group requires a literal "." (escaped below);
# an unescaped dot would capture any character followed by digits, e.g. the
# whole of "x5", as the precision.
fmt_spec_regex = re.compile(
    r"(?P<align>(?P<fill>.)?[<>=^])?(?P<sign>[\+\-\s])?(?P<pound>#)?(?P<zero>0)?(?P<width>\d+)?"
    r"(?P<comma>,)?(?P<precision>\.\d+)?(?P<type>[bcdeEfFgGnosxX%]?)"
)
def _get_fixed_point_regex(width: str | None, precision: str | None) -> str:
    """Build the regular expression matching a fixed point number.

    Args:
        width: Total width of the string representation.
        precision: Number of decimals, optionally written with its leading
            dot (e.g. ``".2"``).

    Returns:
        The naive fixed point pattern when neither width nor precision is
        given, otherwise the width/precision template filled in.
    """
    if not (width or precision):
        return spec_regexes["f"]

    # "0," and "1," turn the template's quantifiers into open-ended ones
    # ({0,} decimals, {1,} characters) when a constraint is missing.
    decimals = "0," if precision is None else precision.strip(".")
    min_width = "1," if width is None else width
    return spec_regexes["f_with_precision"].format(width=min_width, decimals=decimals)
def _get_fill(fill: str | None, width: str | None, ftype: str | None) -> str | None:
# NOTE: does not properly handle `=` alignment
if fill is None:
if width is not None and width[0] == "0":
fill = "0"
elif ftype in ["s", "", "d", "x", "X", "o", "b"]:
fill = " "
return fill
def _get_number_from_fmt(fmt: str) -> int:
"""Helper function for extract_values.
Figures out string length from format string.
"""
if "%" in fmt:
# its datetime
return len(("{0:" + fmt + "}").format(dt.datetime.now()))
else:
# its something else
fmt = fmt.lstrip("0")
fmt_digits_match = re.search("[0-9]+", fmt)
if fmt_digits_match is None:
raise ValueError(f"No number specified in format string: {fmt}")
return int(fmt_digits_match.group(0))
def _convert(convdef: str, stri: str) -> Any:
    """Turn the string *stri* into the value described by *convdef*.

    Args:
        convdef: Conversion definition (format specification of the field).
        stri: Text extracted for the field.

    Returns:
        A datetime for specs containing ``%``; otherwise the text with its
        padding stripped, converted to int (``d``, ``x``/``X``, ``o``,
        ``b``) or float (fixed point types), or left as a string when no
        type marker is present.
    """
    if "%" in convdef:
        return dt.datetime.strptime(stri, convdef)

    unpadded = _strip_padding(convdef, stri)
    if "d" in convdef:
        return int(unpadded)
    # Remaining integer types, checked in this order, with their base.
    for markers, base in (("xX", 16), ("o", 8), ("b", 2)):
        if any(marker in convdef for marker in markers):
            return int(unpadded, base)
    if any(marker in convdef for marker in fixed_point_types):
        return float(unpadded)
    return unpadded
def _strip_padding(convdef: str, stri: str) -> str:
    """Remove the padding that *convdef* describes from *stri*.

    Args:
        convdef: Conversion definition (indicates the padding)
        stri: String to be modified

    Returns:
        *stri* with the fill character stripped from the left for ``>``
        alignment, from the right for ``<`` and from both sides for ``^``.
        The string is returned unchanged when the definition has no
        alignment or uses ``=``.
    """
    spec_match = fmt_spec_regex.match(convdef)
    spec = spec_match.groupdict() if spec_match else {}
    alignment = spec.get("align")
    if not alignment:
        return stri

    # The "align" group may include the fill; the align character is the last one.
    direction = alignment[-1]
    strippers = {">": str.lstrip, "<": str.rstrip, "^": str.strip}
    if direction not in strippers:
        return stri

    # Without an explicit fill character the padding is a space.
    fill_char = spec.get("fill") or " "
    return strippers[direction](stri, fill_char)
[docs]
@lru_cache()
def get_convert_dict(fmt: str) -> dict[str, str]:
    """Map each field name in the format string `fmt` to its format spec.

    Literal-only segments (no field name or no spec) are skipped, and the
    conversion flag of a field is ignored. Results are cached with
    ``lru_cache``, so repeated calls with the same `fmt` return the same
    dictionary object.
    """
    # XXX: Do I need to include 'conversion'?
    return {
        name: spec
        for _literal, name, spec, _conversion in formatter.parse(fmt)
        if name is not None and spec is not None
    }
[docs]
def parse(fmt: str, stri: str, full_match: bool = True) -> dict[str, Any]:
    """Extract keys and converted values from *stri* using the format *fmt*.

    Args:
        fmt: Python format string to match against
        stri: String to extract information from
        full_match: Force the match of the whole string. Default True.

    Returns:
        Dictionary mapping each field name of *fmt* to the value found in
        *stri*, converted according to the field's format specification.
    """
    conversions = get_convert_dict(fmt)
    keyvals = extract_values(fmt, stri, full_match=full_match)
    # Replace each raw string with its converted value, in place.
    for key, convdef in conversions.items():
        keyvals[key] = _convert(convdef, keyvals[key])
    return keyvals
[docs]
def compose(fmt: str, keyvals: Mapping[str, Any], allow_partial: bool = False) -> str:
    """Fill the format string *fmt* with the parameters given in *keyvals*.

    Args:
        fmt: Python format string to fill in
        keyvals: "Parameter --> parameter value" map
        allow_partial: If True, then partial composition is allowed, i.e.,
            not all parameters present in `fmt` need to be specified in `keyvals`.
            Unspecified parameters will, in this case, be left unchanged.
            (Default value = False).

    Returns:
        Result of formatting the *fmt* string with parameter values
        taken from the corresponding items in the *keyvals* dictionary.
    """
    composer = _partial_compose if allow_partial else _strict_compose
    return composer(fmt=fmt, keyvals=keyvals)
# Glob pattern for each strftime directive: a run of "?" where the table
# assigns the directive a fixed number of characters, "*" otherwise.
# NOTE(review): presumably consumed by GlobifyFormatter, which is not visible
# here; the entry order is kept as-is in case it matters to that consumer.
DT_FMT = {
    "%a": "*",
    "%A": "*",
    "%w": "?",
    "%d": "??",
    "%b": "*",
    "%B": "*",
    "%m": "??",
    "%y": "??",
    "%Y": "????",
    "%H": "??",
    "%I": "??",
    "%p": "*",
    "%M": "??",
    "%S": "??",
    "%f": "*",
    "%z": "*",
    "%Z": "*",
    "%j": "???",
    "%U": "??",
    "%W": "??",
    "%c": "*",
    "%x": "*",
    "%X": "*",
    "%%": "?",
}
# Module-wide formatter instance used by ``globify`` to turn a format string
# into a glob pattern.
# NOTE(review): GlobifyFormatter is defined outside this view.
globify_formatter = GlobifyFormatter()
[docs]
def globify(fmt: str, keyvals: Mapping[str, Any] | None = None) -> Any:
    """Build a pattern usable with glob.glob() from a format string.

    Args:
        fmt: Python format string to convert
        keyvals: Optional "Parameter --> parameter value" map; when omitted,
            no parameter values are passed to the formatter.
    """
    known_values = {} if keyvals is None else keyvals
    return globify_formatter.format(fmt, **known_values)
[docs]
def validate(fmt: str, stri: str) -> bool:
    """Tell whether string ``stri`` conforms to ``fmt``.

    Useful for filtering strings, or to check that a string is compatible
    before passing it to the parser function.

    Returns:
        True when ``stri`` parses with ``fmt``, False when parsing raises
        ``ValueError``.
    """
    try:
        parse(fmt, stri)
    except ValueError:
        return False
    else:
        return True
def _generate_data_for_format(fmt: str) -> dict[str, Any]:
    """Create a dictionary of fake data that fills in the format string.

    Raises:
        ValueError: If a second field without a size specifier follows
            another one with no literal text in between.
    """
    generated = {}
    # Remember whether a "free size" (wildcard) field has been seen since the
    # last literal text. A second one would make the pattern invalid: the
    # second wildcard field could never be matched.
    seen_free_size = False
    for literal, name, spec, _conversion in formatter.parse(fmt):
        if literal or not name:
            seen_free_size = False
        if not name:
            continue
        # Free size fields enclosing sized ones also count,
        # e.g. {:s}{:s} or {:s}{:4s}{:d}
        if spec in (None, "", "s", "d"):
            if seen_free_size:
                raise ValueError("Can't generate data for spec with two or more fields with no size specifier.")
            seen_free_size = True
        # Make some data for this key and format.
        generated[name] = _gen_data_for_spec(spec)
    return generated
def _gen_data_for_spec(format_spec: str | None) -> int | str | dt.datetime:
    """Generate one random value suited to the given format specification.

    Returns:
        The current time round-tripped through the spec for datetime specs,
        a random non-negative integer for ``d`` specs with a size, and a
        string of random ASCII letters otherwise.

    Raises:
        ValueError: If a ``d`` spec consists of letters only.
    """
    if format_spec and "%" in format_spec:
        # Datetime: compose and parse once so the value carries no more
        # precision than the format can express.
        field = "{t:" + format_spec + "}"
        now = dt.datetime.now()
        return parse(field, compose(field, {"t": now}))["t"]

    if format_spec and "d" in format_spec:
        if format_spec.isalpha():
            # No size given: clearly bad.
            raise ValueError(f"Bad format specification: {format_spec!r}")
        # Random number limited to the number of digits in the spec.
        digits = _get_number_from_fmt(format_spec)
        return random.randint(0, 99999999999999999) % (10**digits)

    # String type: the length comes from the spec when it is purely
    # alphanumeric, otherwise four characters are used.
    if format_spec is not None and format_spec.isalnum():
        length = _get_number_from_fmt(format_spec)
    else:
        length = 4
    return "".join(random.choice(string.ascii_letters) for _ in range(length))
[docs]
def is_one2one(fmt: str) -> bool:
    """Check if the format string has a one to one correspondence.

    One to one correspondence means that composing data into a string and
    parsing that string again gives back the original data, with no change
    or loss of information. The check is made with generated sample data.

    Note: This test only applies to sensible usage of the format string.
    If string or numeric data causes overflow, e.g. when composing "abcd"
    into {3s}, the correspondence is always broken. The same holds for
    precision losses when using datetime data.

    Returns:
        False when no sample data can be generated for the format or when
        the round trip changes the data, True otherwise.
    """
    try:
        original = _generate_data_for_format(fmt)
    except ValueError:
        return False

    # Run the data forward once and back again.
    round_tripped = parse(fmt, compose(fmt, original))

    if len(original) != len(round_tripped):
        return False
    # Every original key must come back with an equal value.
    return not any(
        key not in round_tripped or round_tripped[key] != original[key]
        for key in original
    )
[docs]
def purge() -> None:
    """Empty the caches kept by this module.

    Not needed normally, but can be used to force the caches of
    ``regex_format`` and ``get_convert_dict`` to be cleared when memory is
    very limited.
    """
    for cached_function in (regex_format, get_convert_dict):
        cached_function.cache_clear()
def _strict_compose(fmt: str, keyvals: Mapping[str, Any]) -> str:
    """Format `fmt` with the parameters in `keyvals` via the module formatter.

    Args:
        fmt: Python format string to fill in
        keyvals: "Parameter --> parameter value" map, passed to the
            formatter as keyword arguments
    """
    composed = formatter.format(fmt, **keyvals)
    return composed
def _partial_compose(fmt: str, keyvals: Mapping[str, Any]) -> str:
    """Format `fmt` with `keyvals`, leaving unspecified parameters untouched.

    Works like _strict_compose, except that parameters of `fmt` missing from
    `keyvals` are allowed. Their field definitions are swapped for
    placeholders before composing and restored afterwards.

    Args:
        fmt (str): Python format string to match against
        keyvals (dict): "Parameter --> parameter value" map
    """
    masked_fmt, masked_fields = _replace_undefined_params_with_placeholders(fmt, keyvals)
    result = _strict_compose(fmt=masked_fmt, keyvals=keyvals)
    # Put the original field definitions back in place of the placeholders.
    for placeholder, field_definition in masked_fields.items():
        result = result.replace(placeholder, field_definition)
    return result
def _replace_undefined_params_with_placeholders(
    fmt: str, keyvals: Mapping[str, Any] | None = None
) -> tuple[str, dict[str, Any]]:
    """Swap the fields of `fmt` that `keyvals` does not define for placeholders.

    Each placeholder is the hexadecimal hash of the field definition wrapped
    in parentheses.

    Returns:
        The rewritten format string and a "placeholder --> original field
        definition" dictionary.

    Raises:
        ValueError: If no field definition can be found in `fmt` for an
            undefined parameter.
    """
    undefined_names = set(get_convert_dict(fmt))
    if keyvals is not None:
        undefined_names -= keyvals.keys()

    placeholders = {}
    masked_fmt = fmt
    for var in sorted(undefined_names):
        field_pattern = rf"{{{re.escape(var)}([^\w{{}}].*?)*}}"
        definitions = {found.group() for found in re.finditer(field_pattern, masked_fmt)}
        if not definitions:
            raise ValueError(f"Could not capture definitions for {var} from {fmt}")
        for definition in definitions:
            placeholder = f"({hex(hash(definition))})"
            placeholders[placeholder] = definition
            masked_fmt = masked_fmt.replace(definition, placeholder)
    return masked_fmt, placeholders