Source code for trollsift.parser

"""Main parsing and formatting functionality."""

from __future__ import annotations

import re
import datetime as dt
import random
import string
from functools import lru_cache
import typing

if typing.TYPE_CHECKING:
    from _typeshed import StrOrLiteralStr
    from typing import Any
    from collections.abc import Iterable, Sequence, Mapping


class Parser:
    """Object-oriented front end for the module-level parse/compose helpers.

    A ``Parser`` stores one format string and forwards every operation to the
    module-level function of the same name, passing that format string along.
    """

    def __init__(self, fmt: str):
        self.fmt = fmt

    def __str__(self):
        return self.fmt

    def keys(self):
        """Return the field names declared in the format string."""
        return get_convert_dict(self.fmt).keys()

    def parse(self, stri: str, full_match: bool = True) -> dict[str, Any]:
        """Extract field names and converted values from ``stri``."""
        parsed = parse(self.fmt, stri, full_match=full_match)
        return parsed

    def compose(self, keyvals: Mapping[str, Any], allow_partial: bool = False) -> str:
        """Fill the format string ``self.fmt`` with the values in ``keyvals``.

        Args:
            keyvals: "Parameter --> parameter value" map
            allow_partial: When True, fields of the format string that have
                no entry in `keyvals` are left in place unchanged instead of
                being required. (Default value = False).

        Returns:
            The *self.fmt* string with every provided field replaced by the
            formatted value of the matching *keyvals* item.

        """
        composed = compose(fmt=self.fmt, keyvals=keyvals, allow_partial=allow_partial)
        return composed

    # ``format`` is kept as an alias of ``compose``
    format = compose

    def globify(self, keyvals: Mapping[str, Any] | None = None) -> str:
        """Build a pattern usable with glob.glob() from the format string."""
        pattern = globify(self.fmt, keyvals)
        return pattern

    def validate(self, stri: str) -> bool:
        """Tell whether ``stri`` can be parsed with this parser's format.

        Handy for filtering candidate strings, or for checking compatibility
        before handing a string to :meth:`parse`.
        """
        is_valid = validate(self.fmt, stri)
        return is_valid

    def is_one2one(self):
        """Check whether composing then parsing gives back the original data.

        That is, input data maps to a string which maps back to the very same
        data, without change or loss of information.

        Note: This test only applies to sensible usage of the format string.
        If string or numeric data causes overflow, e.g. if composing "abcd"
        into ``{3s}``, one to one correspondence will always be broken.
        The same holds for precision losses when using datetime data.
        """
        return is_one2one(self.fmt)
class StringFormatter(string.Formatter):
    """String formatter with extra ``!`` conversions for common trollsift needs.

    On top of the standard conversions, the following single-letter
    conversions can be appended to a field with ``!``:

    >>> fstr = "{!u}_{!l}"
    >>> formatter = StringFormatter()
    >>> formatter.format(fstr, "to_upper", "To_LowerCase")
    'TO_UPPER_to_lowercase'

    - c: Capitalize (first character upper case, the rest lower case) via the
      value's `.capitalize()` method.
    - l: Lowercase via the value's `.lower()` method.
    - R: Remove the separators '-', '_', ' ' and ':' from the value.
    - t: Title case via the value's `.title()` method.
    - u: Uppercase via the value's `.upper()` method.
    - h: 'R' combined with 'l'.
    - H: 'R' combined with 'u'.

    """

    # conversion letter -> name of the ``str`` method to call on the value
    CONV_FUNCS = {
        "c": "capitalize",
        "h": "lower",
        "H": "upper",
        "l": "lower",
        "t": "title",
        "u": "upper",
    }

    def convert_field(self, value: str, conversion: str | None) -> str:
        """Apply the conversions listed in `StringFormatter.CONV_FUNCS` (and 'R')."""
        method_name = None if conversion is None else self.CONV_FUNCS.get(conversion)
        if method_name is None and conversion != "R":
            # not one of ours: standard conversion ('r', 's', none, ...)
            return super().convert_field(value, conversion)
        if method_name is not None:
            value = getattr(value, method_name)()
        if conversion in ("h", "H", "R"):
            for separator in ("-", "_", ":", " "):
                value = value.replace(separator, "")
        return value
# Shared formatter instance used for composing strings and for splitting
# format strings into their fields.
formatter = StringFormatter()

# Regular expression matching ONE character of each format type (or a whole
# number for the fixed point types).
# taken from https://docs.python.org/3/library/re.html#simulating-scanf
spec_regexes = {
    "b": r"[-+]?[0-1]",
    "c": r".",
    "d": r"[-+]?\d",
    # Naive fixed point format specifier (e.g. {foo:f})
    "f": r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?",
    # Fixed point format specifier including width and precision
    # (e.g. {foo:4.2f}). The lookahead (?=.{width}) makes sure that the
    # subsequent pattern is only matched if the string has the required
    # (minimum) width.
    "f_with_precision": r"(?=.{{{width}}})([-+]?([\d ]+(\.\d{{{decimals}}})+|\.\d{{{decimals}}})([eE][-+]?\d+)?)",
    "i": r"[-+]?(0[xX][\dA-Fa-f]+|0[0-7]*|\d+)",
    "o": r"[-+]?[0-7]",
    "s": r"\S",
    "x": r"[-+]?(0[xX])?[\dA-Fa-f]",
}
spec_regexes["e"] = spec_regexes["f"]
spec_regexes["E"] = spec_regexes["f"]
spec_regexes["g"] = spec_regexes["f"]
spec_regexes["X"] = spec_regexes["x"]
spec_regexes[""] = spec_regexes["s"]
# types whose single-character regex may be repeated to fill a field
allow_multiple = ["b", "c", "d", "o", "s", "", "x", "X"]
fixed_point_types = ["f", "e", "E", "g"]
# format_spec ::=  [[fill]align][sign][#][0][width][,][.precision][type]
# https://docs.python.org/3.4/library/string.html#format-specification-mini-language
# NOTE: the precision must start with a literal dot; an unescaped "." here
# would let any character followed by digits (e.g. "a3") pass as a precision.
fmt_spec_regex = re.compile(
    r"(?P<align>(?P<fill>.)?[<>=^])?(?P<sign>[\+\-\s])?(?P<pound>#)?(?P<zero>0)?(?P<width>\d+)?"
    r"(?P<comma>,)?(?P<precision>\.\d+)?(?P<type>[bcdeEfFgGnosxX%]?)"
)


def _get_fixed_point_regex(width: str | None, precision: str | None) -> str:
    """Get regular expression for fixed point numbers.

    Args:
        width: Total width of the string representation.
        precision: Number of decimals, as captured from the format spec
            (i.e. including the leading dot, which is stripped here).

    Returns:
        The naive fixed point regex when neither width nor precision is given,
        otherwise the width/precision aware regex with both values filled in.

    """
    if not (width or precision):
        return spec_regexes["f"]
    # A missing value becomes an open-ended regex quantifier ("N,").
    decimals = "0," if precision is None else precision.strip(".")
    min_width = "1," if width is None else width
    return spec_regexes["f_with_precision"].format(width=min_width, decimals=decimals)
class RegexFormatter(string.Formatter):
    """String formatter that converts a format string to a regular expression.

    >>> regex_formatter = RegexFormatter()
    >>> regex_str = regex_formatter.format('{field_one:5d}_{field_two}')

    Can also be used to extract values from a string given the format spec
    for that string:

    >>> regex_formatter.extract_values('{field_one:5d}_{field_two}', '12345_sometext')
    {'field_one': '12345', 'field_two': 'sometext'}

    Note that the regular expressions generated by this class are specially
    generated to reduce "greediness" of the matches found. For ambiguous
    patterns where a single field could match shorter or longer portions of
    the provided string, this class will prefer the shorter version of the
    string in order to make the rest of the pattern match. For example:

    >>> regex_formatter.extract_values('{field_one}_{field_two}', 'abc_def_ghi')
    {'field_one': 'abc', 'field_two': 'def_ghi'}

    Note how `field_one` could have matched "abc_def", but the lower
    greediness of this parser caused it to only match against "abc".

    """

    # special string to mark a parameter not being specified
    UNPROVIDED_VALUE = "<trollsift unprovided value>"
    # the backslash comes first so that backslashes inserted while escaping
    # the other characters are not escaped a second time
    ESCAPE_CHARACTERS = ["\\"] + [x for x in string.punctuation if x not in "\\%"]
    ESCAPE_SETS = [(c, "\\" + c) for c in ESCAPE_CHARACTERS]

    def __init__(self):
        # hold on to fields we've seen already so we can reuse their
        # definitions in the regex
        self._cached_fields = {}
        # per-instance memoization of the (bound) formatting method
        self.format = lru_cache()(self._uncached_format)
        super(RegexFormatter, self).__init__()

    def _uncached_format(*args, **kwargs):
        """Format without memoization, resetting the seen-fields record afterwards."""
        try:
            # super() doesn't seem to work here
            ret_val = string.Formatter.format(*args, **kwargs)
        finally:
            self = args[0]  # just matching the parent class
            self._cached_fields.clear()
        return ret_val

    def _escape(self, s: str) -> str:
        """Escape bad characters for regular expressions.

        Similar to `re.escape` but allows '%' to pass through.

        """
        for ch, r_ch in self.ESCAPE_SETS:
            s = s.replace(ch, r_ch)
        return s

    def parse(
        self, format_string: StrOrLiteralStr
    ) -> Iterable[
        tuple[
            StrOrLiteralStr,
            StrOrLiteralStr | None,
            StrOrLiteralStr | None,
            StrOrLiteralStr | None,
        ]
    ]:
        """Split ``format_string`` like the parent class, regex-escaping the literal text."""
        parse_ret = super(RegexFormatter, self).parse(format_string)
        for literal_text, field_name, format_spec, conversion in parse_ret:
            # the parent class will call parse multiple times moving
            # 'format_spec' to 'literal_text'. We only escape 'literal_text'
            # so we don't escape things twice.
            literal_text = self._escape(literal_text)
            yield literal_text, field_name, format_spec, conversion

    def get_value(self, key: int | str, args: Sequence[Any], kwargs: Mapping[str, Any]) -> Any:
        """Look up ``key``; a missing key yields a ``(key, UNPROVIDED_VALUE)`` marker tuple."""
        try:
            return super(RegexFormatter, self).get_value(key, args, kwargs)
        except (IndexError, KeyError):
            return key, self.UNPROVIDED_VALUE

    def _regex_datetime(self, format_spec: str) -> str:
        """Turn a datetime format spec into a regular expression.

        ``%%`` is the datetime escape for a literal percent sign. The spec is
        split on it first so that (a) the doubled sign becomes the single
        ``%`` found in a composed string and (b) the character following an
        escaped percent is never mistaken for a directive (``%%d`` is a
        percent sign followed by the letter "d", not a day of month).
        """
        pieces = format_spec.split("%%")
        return "%".join(self._regex_datetime_directives(piece) for piece in pieces)

    def _regex_datetime_directives(self, format_spec: str) -> str:
        """Replace every datetime directive in ``format_spec`` by a regex."""
        replace_str = format_spec
        for fmt_key, fmt_val in DT_FMT.items():
            if fmt_key == "%%":
                # literal percent signs are handled by the caller
                continue
            count = fmt_val.count("?")
            # either a series of numbers or letters/numbers
            regex = r"\d{{{:d}}}".format(count) if count else r"[^ \t\n\r\f\v\-_:]+"
            replace_str = replace_str.replace(fmt_key, regex)
        return replace_str

    def regex_field(self, field_name: str, value: Any, format_spec: str) -> str:
        """Build the regex group for one field.

        The first occurrence of a field becomes a named group; later
        occurrences become a back-reference to it.

        Raises:
            ValueError: if the same field name appears with different format
                specs.

        """
        if value != self.UNPROVIDED_VALUE:
            return super(RegexFormatter, self).format_field(value, format_spec)

        if self._cached_fields.get(field_name, format_spec) != format_spec:
            raise ValueError("Can't specify the same field_name with different formats: {}".format(field_name))
        elif field_name in self._cached_fields:
            return r"(?P={})".format(field_name)
        else:
            self._cached_fields[field_name] = format_spec

        # Replace format spec with glob patterns (*, ?, etc)
        if not format_spec:
            return r"(?P<{}>.*?)".format(field_name)
        if "%" in format_spec:
            return r"(?P<{}>{})".format(field_name, self._regex_datetime(format_spec))
        return format_spec_to_regex(field_name, format_spec)

    def format_field(self, value: Any, format_spec: str) -> str:
        """Format provided values normally; turn unprovided fields into regex groups."""
        if not isinstance(value, tuple) or value[1] != self.UNPROVIDED_VALUE:
            return super(RegexFormatter, self).format_field(value, format_spec)
        field_name, value = value
        return self.regex_field(field_name, value, format_spec)
def format_spec_to_regex(field_name: str, format_spec: str) -> str:
    """Make an attempt at converting a format spec to a regular expression.

    Args:
        field_name: Name given to the named group of the resulting regex.
        format_spec: Format specification of the field. Backslashes (added
            when the spec went through regex escaping) are ignored.

    Returns:
        A named group ``(?P<field_name>...)`` matching values formatted with
        ``format_spec``.

    Raises:
        ValueError: if the spec cannot be interpreted, uses a format type
            that has no regex equivalent, or combines ``=`` alignment with a
            string type.

    """
    # NOTE: remove escaped backslashes so regex matches
    regex_match = fmt_spec_regex.match(format_spec.replace("\\", ""))
    if regex_match is None:
        raise ValueError("Invalid format specification: '{}'".format(format_spec))
    regex_dict = regex_match.groupdict()
    ftype = regex_dict["type"]
    width = regex_dict["width"]
    align = regex_dict["align"]
    precision = regex_dict["precision"]
    fill = _get_fill(regex_dict["fill"], width, ftype)

    # Some types accepted by the format spec grammar (e.g. "n", "F", "G")
    # have no regex equivalent. Report them like any other invalid spec
    # instead of leaking a KeyError from the lookup table.
    if ftype not in spec_regexes:
        raise ValueError(
            "Unsupported format type '{}' in format specification: '{}'".format(ftype, format_spec)
        )
    char_type = spec_regexes[ftype]
    if ftype in fixed_point_types:
        char_type = _get_fixed_point_regex(width=width, precision=precision)
    if ftype in ("s", "") and align and align.endswith("="):
        raise ValueError("Invalid format specification: '{}'".format(format_spec))

    has_width = bool(width) and width != "0"
    final_regex = char_type
    if not has_width:
        if ftype in allow_multiple:
            # unknown length: repeat the single-character regex, non-greedily
            final_regex += r"*?"
    elif fill:
        # we don't know how many fill characters we have compared to
        # field characters so just match all characters and sort it out
        # later during type conversion.
        final_regex = r".{{{}}}".format(int(width))
    elif ftype not in fixed_point_types:
        # we know we have exactly this many characters
        final_regex += r"{{{}}}".format(int(width))
    return r"(?P<{}>{})".format(field_name, final_regex)
def _get_fill(fill: str | None, width: str | None, ftype: str | None) -> str | None:
    """Work out the fill character implied by a format spec.

    An explicit ``fill`` always wins. Otherwise a width starting with "0"
    implies zero filling, and string/integer-like types imply space filling.
    ``None`` is returned when no fill character applies.
    """
    # NOTE: does not properly handle `=` alignment
    if fill is not None:
        return fill
    if width is not None and width[0] == "0":
        return "0"
    if ftype in ("s", "", "d", "x", "X", "o", "b"):
        return " "
    return None
@lru_cache()
def regex_format(fmt: str) -> str:
    """Convert the format string ``fmt`` to a regular expression (memoized)."""
    # A brand new RegexFormatter is built for every conversion so that
    # concurrent calls cannot interfere with one another.
    regex_builder = RegexFormatter()
    return regex_builder.format(fmt)
def extract_values(fmt: str, stri: str, full_match: bool = True) -> dict[str, Any]:
    """Pull the raw (string) field values out of ``stri``.

    Args:
        fmt: Python format string to match against
        stri: String to extract information from
        full_match: Anchor the pattern at both ends of the string.
            Default to ``True``.

    Returns:
        Mapping of field name to the text matched for that field.

    Raises:
        ValueError: if ``stri`` does not match the pattern built from ``fmt``.

    """
    pattern = regex_format(fmt)
    if full_match:
        pattern = f"^{pattern}$"
    found = re.match(pattern, stri)
    if found is None:
        raise ValueError("String does not match pattern.")
    return found.groupdict()
def _get_number_from_fmt(fmt: str) -> int:
    """Work out the string length implied by a format spec.

    For datetime specs this is the length of the current time rendered with
    the spec; otherwise it is the first run of digits found in the spec once
    leading zeros are dropped.

    Raises:
        ValueError: if a non-datetime spec contains no number.

    """
    if "%" in fmt:
        # its datetime
        rendered = ("{0:" + fmt + "}").format(dt.datetime.now())
        return len(rendered)
    # its something else
    stripped = fmt.lstrip("0")
    digits = re.search("[0-9]+", stripped)
    if digits is None:
        raise ValueError(f"No number specified in format string: {stripped}")
    return int(digits.group(0))


def _convert(convdef: str, stri: str) -> Any:
    """Convert the string *stri* according to the conversion definition *convdef*.

    Datetime definitions are parsed with ``strptime``. Otherwise padding is
    stripped and the text is converted to an integer (base 10, 16, 8 or 2) or
    a float depending on the type letter found in *convdef*; when no known
    letter is found the stripped text is returned.
    """
    if "%" in convdef:
        return dt.datetime.strptime(stri, convdef)

    text = _strip_padding(convdef, stri)
    if "d" in convdef:
        return int(text)
    if "x" in convdef or "X" in convdef:
        return int(text, 16)
    if "o" in convdef:
        return int(text, 8)
    if "b" in convdef:
        return int(text, 2)
    if any(marker in convdef for marker in fixed_point_types):
        return float(text)
    return text


def _strip_padding(convdef: str, stri: str) -> str:
    """Strip padding from the given string.

    Args:
        convdef: Conversion definition (indicates the padding)
        stri: String to be modified

    Returns:
        ``stri`` without the fill characters on the side(s) given by the
        alignment of ``convdef``; unchanged when there is no alignment.

    """
    spec = fmt_spec_regex.match(convdef)
    groups = spec.groupdict() if spec else {}
    align = groups.get("align")
    pad = groups.get("fill")
    if not align:
        return stri

    # align character is the last one
    direction = align[-1]
    if direction in "<>^" and not pad:
        pad = " "
    if direction == ">":
        return stri.lstrip(pad)
    if direction == "<":
        return stri.rstrip(pad)
    if direction == "^":
        return stri.strip(pad)
    return stri
@lru_cache()
def get_convert_dict(fmt: str) -> dict[str, str]:
    """Map every field name of the format string `fmt` to its format spec."""
    # XXX: Do I need to include 'conversion'?
    return {
        field_name: format_spec
        for _literal_text, field_name, format_spec, _conversion in formatter.parse(fmt)
        if field_name is not None and format_spec is not None
    }
def parse(fmt: str, stri: str, full_match: bool = True) -> dict[str, Any]:
    """Extract the fields of *fmt* from *stri* and convert them to their types.

    Args:
        fmt: Python format string to match against
        stri: String to extract information from
        full_match: Force the match of the whole string. Default True.

    Returns:
        Mapping of field name to converted value.

    """
    conversions = get_convert_dict(fmt)
    keyvals = extract_values(fmt, stri, full_match=full_match)
    for key, spec in conversions.items():
        keyvals[key] = _convert(spec, keyvals[key])
    return keyvals
def compose(fmt: str, keyvals: Mapping[str, Any], allow_partial: bool = False) -> str:
    """Fill the format string *fmt* with the values given in the *keyvals* dict.

    Args:
        fmt: Python format string to fill in
        keyvals: "Parameter --> parameter value" map
        allow_partial: When True, fields of `fmt` that have no entry in
            `keyvals` are left in place unchanged instead of being required.
            (Default value = False).

    Returns:
        The *fmt* string with every provided field replaced by the formatted
        value of the matching *keyvals* item.

    """
    composer = _partial_compose if allow_partial else _strict_compose
    return composer(fmt=fmt, keyvals=keyvals)
# Glob pattern substituted for each datetime directive: one "?" per
# character for the directives mapped to a fixed number of characters, "*"
# for the others. The number of "?" is also used when building regular
# expressions (as the number of digits to match).
DT_FMT = {
    "%a": "*",
    "%A": "*",
    "%w": "?",
    "%d": "??",
    "%b": "*",
    "%B": "*",
    "%m": "??",
    "%y": "??",
    "%Y": "????",
    "%H": "??",
    "%I": "??",
    "%p": "*",
    "%M": "??",
    "%S": "??",
    "%f": "*",
    "%z": "*",
    "%Z": "*",
    "%j": "???",
    "%U": "??",
    "%W": "??",
    "%c": "*",
    "%x": "*",
    "%X": "*",
    "%%": "?",
}
class GlobifyFormatter(string.Formatter):
    """String formatter that turns unprovided fields into glob patterns.

    Fields for which a value is given are formatted as usual; every other
    field is replaced by ``*`` / ``?`` wildcards derived from its format spec.
    """

    # special string to mark a parameter not being specified
    UNPROVIDED_VALUE = "<trollsift unprovided value>"

    def get_value(self, key: str | int, args: Sequence[Any], kwargs: Mapping[str, Any]) -> Any:
        """Look up ``key``, returning the "unprovided" marker when it is missing."""
        try:
            return super().get_value(key, args, kwargs)
        except (IndexError, KeyError):
            # assumes that
            return self.UNPROVIDED_VALUE

    def format_field(self, value: Any, format_spec: str) -> str:
        """Format a provided value, or build the glob pattern of a missing one."""
        is_sequence = isinstance(value, (list, tuple))
        if not is_sequence and value != self.UNPROVIDED_VALUE:
            return super().format_field(value, format_spec)

        if value != self.UNPROVIDED_VALUE:
            # partial provided date/time fields
            # specified with a tuple/list of 2 elements
            # (value, partial format string)
            moment, dt_fmt = value
            for fmt_letter in dt_fmt:
                directive = "%" + fmt_letter
                format_spec = format_spec.replace(directive, moment.strftime(directive))

        # Replace format spec with glob patterns (*, ?, etc)
        if not format_spec:
            return "*"
        if "%" in format_spec:
            glob_pattern = format_spec
            for fmt_key, fmt_val in DT_FMT.items():
                glob_pattern = glob_pattern.replace(fmt_key, fmt_val)
            return glob_pattern
        if not re.search("[0-9]+", format_spec):
            # non-integer type
            return "*"
        return "?" * _get_number_from_fmt(format_spec)
# Module-level GlobifyFormatter instance, created once at import time.
globify_formatter = GlobifyFormatter()
def globify(fmt: str, keyvals: Mapping[str, Any] | None = None) -> Any:
    """Build a pattern usable with glob.glob() from ``fmt`` and the known values.

    Fields present in ``keyvals`` are filled in; the remaining ones are
    replaced by glob wildcards.
    """
    provided = {} if keyvals is None else keyvals
    return globify_formatter.format(fmt, **provided)
def validate(fmt: str, stri: str) -> bool:
    """Tell whether the string ``stri`` conforms to ``fmt``.

    Handy for filtering strings, or for checking that a string is compatible
    before handing it to the parser function. A string is considered valid
    when parsing it does not raise ``ValueError``.
    """
    try:
        parse(fmt, stri)
    except ValueError:
        return False
    return True
def _generate_data_for_format(fmt: str) -> dict[str, Any]:
    """Build a dictionary of made-up values able to fill in the format string.

    Raises:
        ValueError: if an unsized field follows another unsized field without
            literal text in between, since such a pattern could never be
            parsed back unambiguously.

    """
    generated = {}
    # Remember whether an unsized ("free size" / wildcard) field was seen
    # since the last literal text: a second one makes the pattern invalid,
    # e.g. {:s}{:s} or {:s}{:4s}{:d}
    previous_unsized = False
    for literal_text, field_name, format_spec, _conversion in formatter.parse(fmt):
        if literal_text or not field_name:
            previous_unsized = False
        if not field_name:
            continue
        if format_spec in (None, "", "s", "d"):
            if previous_unsized:
                raise ValueError("Can't generate data for spec with two or more fields with no size specifier.")
            previous_unsized = True
        # make some data for this key and format
        generated[field_name] = _gen_data_for_spec(format_spec)
    return generated


def _gen_data_for_spec(format_spec: str | None) -> int | str | dt.datetime:
    """Make up a value matching one format spec.

    Datetime specs give the current time limited to the spec's precision,
    integer specs a random number with the number of digits found in the
    spec, and anything else a random string of ASCII letters.

    Raises:
        ValueError: for an integer spec made of letters only.

    """
    if format_spec and "%" in format_spec:
        # some datetime; run once through format to limit precision
        now = dt.datetime.now()
        template = "{t:" + format_spec + "}"
        return parse(template, compose(template, {"t": now}))["t"]

    if format_spec and "d" in format_spec:
        if format_spec.isalpha():
            # clearly bad
            raise ValueError(f"Bad format specification: {format_spec!r}")
        # random number (with n sign. figures)
        digits = _get_number_from_fmt(format_spec)
        return random.randint(0, 99999999999999999) % (10**digits)

    # string type
    length = 4
    if format_spec is not None and format_spec.isalnum():
        length = _get_number_from_fmt(format_spec)
    return "".join(random.choice(string.ascii_letters) for _ in range(length))
def is_one2one(fmt: str) -> bool:
    """Check whether composing then parsing with ``fmt`` gives back the original data.

    That is, input data maps to a string which maps back to the very same
    data, without any change or loss in information. The check is made on
    generated sample data; a format for which no sample data can be
    generated is reported as not one to one.

    Note: This test only applies to sensible usage of the format string.
    If string or numeric data causes overflow, e.g. if composing "abcd"
    into {3s}, one to one correspondence will always be broken.
    The same holds for precision losses when using datetime data.
    """
    try:
        data = _generate_data_for_format(fmt)
    except ValueError:
        return False

    # run data forward once and back to data
    roundtrip = parse(fmt, compose(fmt, data))

    # the round trip must give back exactly the same keys and values
    if len(roundtrip) != len(data):
        return False
    for key, expected in data.items():
        if key not in roundtrip or roundtrip[key] != expected:
            return False
    return True
def purge() -> None:
    """Empty the module's memoization caches.

    Not needed normally, but can be used to force cache clear when memory
    is very limited.
    """
    for cached_function in (regex_format, get_convert_dict):
        cached_function.cache_clear()
def _strict_compose(fmt: str, keyvals: Mapping[str, Any]) -> str:
    """Format `fmt` with the values in `keyvals`; every field must be provided."""
    return formatter.format(fmt, **keyvals)


def _partial_compose(fmt: str, keyvals: Mapping[str, Any]) -> str:
    """Format `fmt` with the values in `keyvals`, keeping unknown fields as they are.

    Works like _strict_compose except that fields of `fmt` missing from
    `keyvals` are accepted: they are swapped for placeholders before
    formatting and put back, unchanged, afterwards.

    Args:
        fmt (str): Python format string to fill in
        keyvals (dict): "Parameter --> parameter value" map

    """
    placeholder_fmt, placeholders = _replace_undefined_params_with_placeholders(fmt, keyvals)
    result = _strict_compose(fmt=placeholder_fmt, keyvals=keyvals)
    for placeholder, original_field in placeholders.items():
        result = result.replace(placeholder, original_field)
    return result


def _replace_undefined_params_with_placeholders(
    fmt: str, keyvals: Mapping[str, Any] | None = None
) -> tuple[str, dict[str, Any]]:
    """Swap the fields of `fmt` that are missing from `keyvals` for placeholders.

    Returns:
        The modified format string and a "placeholder --> original field
        text" map allowing the swap to be undone.

    Raises:
        ValueError: if the text of an undefined field cannot be located in
            `fmt`.

    """
    undefined = set(get_convert_dict(fmt).keys())
    if keyvals is not None:
        undefined -= keyvals.keys()

    placeholders = {}
    new_fmt = fmt
    for var in sorted(undefined):
        # every distinct "{var...}" occurrence (with or without spec/conversion)
        pattern = rf"{{{re.escape(var)}([^\w{{}}].*?)*}}"
        found = {occurrence.group() for occurrence in re.finditer(pattern, new_fmt)}
        if not found:
            raise ValueError(f"Could not capture definitions for {var} from {fmt}")
        for field_text in found:
            placeholder = f"({hex(hash(field_text))})"
            placeholders[placeholder] = field_text
            new_fmt = new_fmt.replace(field_text, placeholder)

    return new_fmt, placeholders