Source code for trollsift.parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2014-2022 Trollsift Developers
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Main parsing and formatting functionality."""
import re
import datetime as dt
import random
import string
from functools import lru_cache
class Parser(object):
    """Object-oriented wrapper around the module-level parsing functions.

    An instance stores one format string and forwards every call to the
    module-level function of the same name, supplying that format string.
    """

    def __init__(self, fmt):
        self.fmt = fmt

    def __str__(self):
        return self.fmt

    def keys(self):
        """Return the parameter names defined in the format string."""
        return get_convert_dict(self.fmt).keys()

    def parse(self, stri, full_match=True):
        """Extract and convert the values found in *stri*.

        Args:
            stri (str): String to extract information from.
            full_match (bool): Passed through to the module-level ``parse``.
                (Default value = True).

        Returns:
            dict: The result of the module-level ``parse`` for this format.
        """
        parsed = parse(self.fmt, stri, full_match=full_match)
        return parsed

    def compose(self, keyvals, allow_partial=False):
        """Fill the format string with the values given in *keyvals*.

        Args:
            keyvals (dict): "Parameter --> parameter value" map.
            allow_partial (bool): If True, parameters of the format string
                that are missing from `keyvals` are left unchanged instead of
                being required. (Default value = False).

        Returns:
            str: The format string with the given parameter values applied.
        """
        composed = compose(fmt=self.fmt, keyvals=keyvals, allow_partial=allow_partial)
        return composed

    # ``format`` is an alias of ``compose``.
    format = compose

    def globify(self, keyvals=None):
        """Build a string usable with ``glob.glob()`` from the format string.

        Args:
            keyvals (dict): Optional parameter values to fill in.
        """
        pattern = globify(self.fmt, keyvals)
        return pattern

    def validate(self, stri):
        """Check whether *stri* can be parsed with this format string.

        Useful for filtering strings, or for checking that a string is
        compatible before passing it to the parser.
        """
        is_valid = validate(self.fmt, stri)
        return is_valid

    def is_one2one(self):
        """Check whether the format string has a one to one correspondence.

        That is, whether composing data into a string and parsing that string
        again yields the original data without change or loss of information.

        Note: This test only applies to sensible usage of the format string.
        If string or numeric data causes overflow, e.g. when composing "abcd"
        into {3s}, one to one correspondence is always broken. The same
        applies to precision losses when using datetime data.
        """
        one2one = is_one2one(self.fmt)
        return one2one
class StringFormatter(string.Formatter):
    """String formatter with extra ``!`` conversions for plain strings.

    On top of the standard conversions, the following single-letter
    conversions are understood:

    - c: Capitalize via the parameter's `.capitalize()` method.
    - l: Lowercase via the parameter's `.lower()` method.
    - R: Remove the separators '-', '_', ' ' and ':' from the parameter.
    - t: Title case via the parameter's `.title()` method.
    - u: Uppercase via the parameter's `.upper()` method.
    - h: A combination of 'R' and 'l'.
    - H: A combination of 'R' and 'u'.

    >>> fstr = "{!u}_{!l}"
    >>> formatter = StringFormatter()
    >>> formatter.format(fstr, "to_upper", "To_LowerCase")
    'TO_UPPER_to_lowercase'
    """

    # conversion letter -> name of the str method that implements it
    CONV_FUNCS = {
        'c': 'capitalize',
        'h': 'lower',
        'H': 'upper',
        'l': 'lower',
        't': 'title',
        'u': 'upper'
    }

    def convert_field(self, value, conversion):
        """Apply the custom conversions, deferring to the parent for the rest."""
        method_name = self.CONV_FUNCS.get(conversion)
        if method_name is None and conversion != 'R':
            # default conversion ('r', 's')
            return super(StringFormatter, self).convert_field(value, conversion)
        if method_name is not None:
            value = getattr(value, method_name)()
        if conversion in ('h', 'H', 'R'):
            # these conversions additionally drop all separators
            for separator in ('-', '_', ':', ' '):
                value = value.replace(separator, '')
        return value
# Module-level StringFormatter instance shared by the functions in this module.
formatter = StringFormatter()
# Regular expression matching one unit of each format type.
# taken from https://docs.python.org/3/library/re.html#simulating-scanf
spec_regexes = {
    'b': r'[-+]?[0-1]',
    'c': r'.',
    'd': r'[-+]?\d',
    'f': {
        # Naive fixed point format specifier (e.g. {foo:f})
        'naive': r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?',
        # Fixed point format specifier including width and precision
        # (e.g. {foo:4.2f}). The lookahead (?=.{width}) makes sure that the
        # subsequent pattern is only matched if the string has the required
        # (minimum) width.
        'precision': r'(?=.{{{width}}})([-+]?([\d ]+(\.\d{{{decimals}}})+|\.\d{{{decimals}}})([eE][-+]?\d+)?)'
    },
    'i': r'[-+]?(0[xX][\dA-Fa-f]+|0[0-7]*|\d+)',
    'o': r'[-+]?[0-7]',
    's': r'\S',
    'x': r'[-+]?(0[xX])?[\dA-Fa-f]',
}
# types that share the regular expression of another type
spec_regexes['e'] = spec_regexes['f']
spec_regexes['E'] = spec_regexes['f']
spec_regexes['g'] = spec_regexes['f']
spec_regexes['X'] = spec_regexes['x']
spec_regexes[''] = spec_regexes['s']
# types whose regular expression describes a single character and may repeat
allow_multiple = ['b', 'c', 'd', 'o', 's', '', 'x', 'X']
fixed_point_types = ['f', 'e', 'E', 'g']
# format_spec ::= [[fill]align][sign][#][0][width][,][.precision][type]
# https://docs.python.org/3.4/library/string.html#format-specification-mini-language
# NOTE: the precision group requires a literal '.'; an unescaped '.' would
# accept any character followed by digits as a precision.
fmt_spec_regex = re.compile(
    r'(?P<align>(?P<fill>.)?[<>=^])?(?P<sign>[\+\-\s])?(?P<pound>#)?(?P<zero>0)?(?P<width>\d+)?'
    r'(?P<comma>,)?(?P<precision>\.\d+)?(?P<type>[bcdeEfFgGnosxX%]?)')
def _get_fixed_point_regex(regex_dict, width, precision):
"""Get regular expression for fixed point numbers.
Args:
width: Total width of the string representation.
precision: Number of decimals.
"""
if width or precision:
if precision is None:
precision = '0,'
else:
precision = precision.strip('.')
if width is None:
width = '1,'
return regex_dict['precision'].format(
width=width, decimals=precision)
else:
return regex_dict['naive']
class RegexFormatter(string.Formatter):
    """String formatter that converts a format string to a regular expression.

    >>> regex_formatter = RegexFormatter()
    >>> regex_str = regex_formatter.format('{field_one:5d}_{field_two}')

    Fields that are not given a value are turned into named groups, which the
    module-level ``extract_values`` function uses to pull values out of a
    string.

    Fields without a width are matched non-greedily. For ambiguous patterns
    where a single field could match shorter or longer portions of the string,
    the shorter version is preferred so that the rest of the pattern can
    match. For example, matching 'abc_def_ghi' against
    '{field_one}_{field_two}' gives 'abc' for `field_one` and 'def_ghi' for
    `field_two`, although `field_one` could also have matched 'abc_def'.
    """

    # special string to mark a parameter not being specified
    UNPROVIDED_VALUE = '<trollsift unprovided value>'
    # the backslash comes first so that backslashes added while escaping the
    # other characters are not escaped again
    ESCAPE_CHARACTERS = ['\\'] + [x for x in string.punctuation if x not in '\\%']
    ESCAPE_SETS = [(c, '\\' + c) for c in ESCAPE_CHARACTERS]

    def __init__(self):
        # hold on to fields we've seen already so we can reuse their
        # definitions in the regex
        self._cached_fields = {}
        super(RegexFormatter, self).__init__()

    # NOTE: the cache is keyed on the instance as well as the arguments, so it
    # keeps a reference to every instance it has seen until entries are evicted.
    @lru_cache()
    def format(*args, **kwargs):
        """Convert a format string to a regular expression string.

        Takes the same arguments as ``string.Formatter.format``. The record of
        fields seen during the conversion is always cleared afterwards, even
        when the conversion raises.
        """
        try:
            # super() doesn't seem to work here
            ret_val = string.Formatter.format(*args, **kwargs)
        finally:
            self = args[0]  # just matching the parent class
            self._cached_fields.clear()
        return ret_val

    def _escape(self, s):
        """Escape bad characters for regular expressions.

        Similar to `re.escape` but allows '%' to pass through.
        """
        for ch, r_ch in self.ESCAPE_SETS:
            s = s.replace(ch, r_ch)
        return s

    def parse(self, format_string):
        """Parse *format_string* like the parent class, escaping literal text."""
        parse_ret = super(RegexFormatter, self).parse(format_string)
        for literal_text, field_name, format_spec, conversion in parse_ret:
            # the parent class will call parse multiple times moving
            # 'format_spec' to 'literal_text'. We only escape 'literal_text'
            # so we don't escape things twice.
            literal_text = self._escape(literal_text)
            yield literal_text, field_name, format_spec, conversion

    def get_value(self, key, args, kwargs):
        """Return the value for *key*, or a ``(key, UNPROVIDED_VALUE)`` marker."""
        try:
            return super(RegexFormatter, self).get_value(key, args, kwargs)
        except (IndexError, KeyError):
            return key, self.UNPROVIDED_VALUE

    def _regex_datetime(self, format_spec):
        """Convert a datetime format spec to a regular expression.

        Every known ``%X`` directive is replaced by a pattern for its value
        and ``%%`` by a literal percent sign. The spec is scanned in a single
        pass so that the character following ``%%`` is never mistaken for part
        of a directive. Unknown directives are left untouched.
        """
        def _directive_to_regex(match):
            directive = match.group(0)
            if directive == '%%':
                # special case: an escaped percent sign
                return '%'
            glob_pattern = DT_FMT.get(directive)
            if glob_pattern is None:
                return directive
            count = glob_pattern.count('?')
            # either a series of numbers or letters/numbers
            return r'\d{{{:d}}}'.format(count) if count else r'[^ \t\n\r\f\v\-_:]+'

        return re.sub(r'%.', _directive_to_regex, format_spec)

    @staticmethod
    def format_spec_to_regex(field_name, format_spec):
        """Make an attempt at converting a format spec to a regular expression.

        Args:
            field_name: Name given to the resulting named group.
            format_spec: Format specification of the field.

        Returns:
            str: A named group matching values formatted with `format_spec`.

        Raises:
            ValueError: If the specification cannot be interpreted or uses
                ``=`` alignment with a string type.
        """
        # NOTE: remove escaped backslashes so regex matches
        regex_match = fmt_spec_regex.match(format_spec.replace('\\', ''))
        if regex_match is None:
            raise ValueError("Invalid format specification: '{}'".format(format_spec))
        regex_dict = regex_match.groupdict()
        fill = regex_dict['fill']
        ftype = regex_dict['type']
        width = regex_dict['width']
        align = regex_dict['align']
        precision = regex_dict['precision']
        # NOTE: does not properly handle `=` alignment
        if fill is None:
            if width is not None and width[0] == '0':
                fill = '0'
            elif ftype in ['s', '', 'd', 'x', 'X', 'o', 'b']:
                fill = ' '

        char_type = spec_regexes[ftype]
        if ftype in fixed_point_types:
            char_type = _get_fixed_point_regex(
                char_type,
                width=width,
                precision=precision
            )
        if ftype in ('s', '') and align and align.endswith('='):
            raise ValueError("Invalid format specification: '{}'".format(format_spec))
        final_regex = char_type
        if ftype in allow_multiple and (not width or width == '0'):
            final_regex += r'*?'
        elif width and width != '0':
            if not fill and ftype not in fixed_point_types:
                # we know we have exactly this many characters
                final_regex += r'{{{}}}'.format(int(width))
            elif fill:
                # we don't know how many fill characters we have compared to
                # field characters so just match all characters and sort it out
                # later during type conversion.
                final_regex = r'.{{{}}}'.format(int(width))
            elif ftype in allow_multiple:
                final_regex += r'*?'

        return r'(?P<{}>{})'.format(field_name, final_regex)

    def regex_field(self, field_name, value, format_spec):
        """Return the regular expression for one field.

        A field seen for the first time becomes a named group; further
        occurrences of the same field become back-references to that group.

        Raises:
            ValueError: If the same field name is used with different formats.
        """
        if value != self.UNPROVIDED_VALUE:
            return super(RegexFormatter, self).format_field(value, format_spec)

        if self._cached_fields.get(field_name, format_spec) != format_spec:
            raise ValueError("Can't specify the same field_name with "
                             "different formats: {}".format(field_name))
        elif field_name in self._cached_fields:
            return r'(?P={})'.format(field_name)
        else:
            self._cached_fields[field_name] = format_spec

        # Replace format spec with glob patterns (*, ?, etc)
        if not format_spec:
            return r'(?P<{}>.*?)'.format(field_name)
        if '%' in format_spec:
            return r'(?P<{}>{})'.format(field_name, self._regex_datetime(format_spec))
        return self.format_spec_to_regex(field_name, format_spec)

    def format_field(self, value, format_spec):
        """Format provided values normally and unprovided ones as a regex."""
        if not isinstance(value, tuple) or value[1] != self.UNPROVIDED_VALUE:
            return super(RegexFormatter, self).format_field(value, format_spec)
        field_name, value = value
        return self.regex_field(field_name, value, format_spec)
@lru_cache()
def regex_format(fmt):
    """Return the regular expression string for the format string *fmt*.

    Results are cached per format string.
    """
    # We create a new instance of RegexFormatter here to prevent concurrent calls to
    # format interfering with one another.
    one_shot_formatter = RegexFormatter()
    return one_shot_formatter.format(fmt)
def extract_values(fmt, stri, full_match=True):
    """Extract the raw field values from a string matching a format.

    Args:
        fmt (str): Python format string to match against
        stri (str): String to extract information from
        full_match (bool): Anchor the pattern at both ends of the string.
            Default to ``True``.

    Returns:
        dict: Field name --> matched text.

    Raises:
        ValueError: If *stri* does not match the pattern.
    """
    pattern = regex_format(fmt)
    if full_match:
        pattern = '^{}$'.format(pattern)
    found = re.match(pattern, stri)
    if found is not None:
        return found.groupdict()
    raise ValueError("String does not match pattern.")
def _get_number_from_fmt(fmt):
"""Helper function for extract_values.
Figures out string length from format string.
"""
if '%' in fmt:
# its datetime
return len(("{0:" + fmt + "}").format(dt.datetime.now()))
else:
# its something else
fmt = fmt.lstrip('0')
return int(re.search('[0-9]+', fmt).group(0))
def _convert(convdef, stri):
"""Convert the string *stri* to the given conversion definition *convdef*."""
if '%' in convdef:
result = dt.datetime.strptime(stri, convdef)
else:
result = _strip_padding(convdef, stri)
if 'd' in convdef:
result = int(result)
elif 'x' in convdef or 'X' in convdef:
result = int(result, 16)
elif 'o' in convdef:
result = int(result, 8)
elif 'b' in convdef:
result = int(result, 2)
elif any(float_type_marker in convdef for float_type_marker in fixed_point_types):
result = float(result)
return result
def _strip_padding(convdef, stri):
    """Remove the padding described by *convdef* from *stri*.

    Args:
        convdef: Conversion definition (indicates the padding)
        stri: String to be modified

    Returns:
        The string without its fill characters on the padded side(s). It is
        returned unchanged when the definition has no '<', '>' or '^'
        alignment.
    """
    spec_match = fmt_spec_regex.match(convdef)
    spec = spec_match.groupdict() if spec_match else {}
    align = spec.get('align')
    if not align:
        return stri
    # align character is the last one
    side = align[-1]
    if side not in ('<', '>', '^'):
        return stri
    # a space is the fill character when none is given
    pad = spec.get('fill') or ' '
    if side == '>':
        return stri.lstrip(pad)
    if side == '<':
        return stri.rstrip(pad)
    return stri.strip(pad)
@lru_cache()
def get_convert_dict(fmt):
    """Map each field name in the format string `fmt` to its format spec.

    Results are cached per format string.
    """
    # XXX: Do I need to include 'conversion'?
    return {
        field_name: format_spec
        for _literal_text, field_name, format_spec, _conversion in formatter.parse(fmt)
        if field_name is not None
    }
def parse(fmt, stri, full_match=True):
    """Extract the values of the fields in *fmt* from *stri* and convert them.

    Args:
        fmt (str): Python format string to match against
        stri (str): String to extract information from
        full_match (bool): Force the match of the whole string. Default
            True.

    Returns:
        dict: Field name --> converted value.
    """
    conversions = get_convert_dict(fmt)
    values = extract_values(fmt, stri, full_match=full_match)
    for name, convdef in conversions.items():
        values[name] = _convert(convdef, values[name])
    return values
def compose(fmt, keyvals, allow_partial=False):
    """Fill the format string *fmt* with the parameters given in *keyvals*.

    Args:
        fmt (str): Python format string to fill in
        keyvals (dict): "Parameter --> parameter value" map
        allow_partial (bool): If True, then partial composition is allowed, i.e.,
            not all parameters present in `fmt` need to be specified in `keyvals`.
            Unspecified parameters will, in this case, be left unchanged.
            (Default value = False).

    Returns:
        str: Result of formatting the *fmt* string with parameter values
            extracted from the corresponding items in the *keyvals* dictionary.
    """
    composer = _partial_compose if allow_partial else _strict_compose
    return composer(fmt=fmt, keyvals=keyvals)
# Glob pattern for each datetime directive: one '?' per character of a
# fixed-width value, '*' for a value of variable width.
# The order of the entries is kept as is; '%%' comes last.
DT_FMT = {
    # weekday
    '%a': '*',
    '%A': '*',
    '%w': '?',
    # day, month and year
    '%d': '??',
    '%b': '*',
    '%B': '*',
    '%m': '??',
    '%y': '??',
    '%Y': '????',
    # time of day
    '%H': '??',
    '%I': '??',
    '%p': '*',
    '%M': '??',
    '%S': '??',
    '%f': '*',
    # time zone
    '%z': '*',
    '%Z': '*',
    # day and week of the year
    '%j': '???',
    '%U': '??',
    '%W': '??',
    # locale dependent representations
    '%c': '*',
    '%x': '*',
    '%X': '*',
    # literal percent sign
    '%%': '?',
}
class GlobifyFormatter(string.Formatter):
    """String formatter that converts a format string to a glob pattern.

    Fields that are given a value are formatted normally. Fields without a
    value are replaced by glob wildcards derived from their format spec.
    """

    # special string to mark a parameter not being specified
    UNPROVIDED_VALUE = '<trollsift unprovided value>'

    def get_value(self, key, args, kwargs):
        """Return the value for *key*, or the unprovided marker if missing."""
        try:
            return super(GlobifyFormatter, self).get_value(key, args, kwargs)
        except (IndexError, KeyError):
            return self.UNPROVIDED_VALUE

    def format_field(self, value, format_spec):
        """Format a provided value, or build a glob pattern for the field.

        Args:
            value: The field value, the unprovided marker, or a 2-element
                list/tuple ``(value, partial format string)`` naming the
                datetime directive letters to fill in from ``value``.
            format_spec (str): Format specification of the field.

        Returns:
            str: The formatted value or a glob pattern standing in for it.
        """
        is_partial = isinstance(value, (list, tuple))
        if not is_partial and value != self.UNPROVIDED_VALUE:
            return super(GlobifyFormatter, self).format_field(value, format_spec)

        had_directives = '%' in format_spec
        if is_partial:
            # partial provided date/time fields
            # specified with a tuple/list of 2 elements
            # (value, partial format string)
            dt_value, dt_fmt = value
            for fmt_letter in dt_fmt:
                fmt = '%' + fmt_letter
                format_spec = format_spec.replace(fmt, dt_value.strftime(fmt))

        # Replace format spec with glob patterns (*, ?, etc)
        if not format_spec:
            return '*'
        if '%' in format_spec:
            replace_str = format_spec
            for fmt_key, fmt_val in DT_FMT.items():
                replace_str = replace_str.replace(fmt_key, fmt_val)
            return replace_str
        if is_partial and had_directives:
            # Every directive was filled in. The remaining text is the
            # formatted date itself; its digits must not be read as a width.
            return format_spec
        if not re.search('[0-9]+', format_spec):
            # non-integer type
            return '*'
        return '?' * _get_number_from_fmt(format_spec)
# Module-level GlobifyFormatter instance used by globify().
globify_formatter = GlobifyFormatter()
def globify(fmt, keyvals=None):
    """Build a string usable with glob.glob() from the format string *fmt*.

    Args:
        fmt (str): Python format string
        keyvals (dict): Optional "parameter --> parameter value" map.
    """
    provided = {} if keyvals is None else keyvals
    return globify_formatter.format(fmt, **provided)
def validate(fmt, stri):
    """Check whether *stri* can be parsed with the format string *fmt*.

    Useful for filtering strings, or for checking that a string is
    compatible before passing it to the parser function.

    Returns:
        bool: False if parsing raises a ValueError, True otherwise.
    """
    try:
        parse(fmt, stri)
    except ValueError:
        return False
    return True
def _generate_data_for_format(fmt):
    """Generate a fake data dictionary to fill in the provided format string.

    Returns:
        dict: Field name --> generated value, or None if the format string
        has two free size fields in a row or an integer field without a
        width.
    """
    # finally try some data, create some random data for the fmt.
    data = {}
    # keep track of how many "free_size" (wildcard) parameters we have
    # if we get two in a row then we know the pattern is invalid, meaning
    # we'll never be able to match the second wildcard field
    free_size_start = False
    for literal_text, field_name, format_spec, conversion in formatter.parse(fmt):
        if literal_text:
            free_size_start = False
        if not field_name:
            free_size_start = False
            continue
        # encapsulating free size keys,
        # e.g. {:s}{:s} or {:s}{:4s}{:d}
        if not format_spec or format_spec == "s" or format_spec == "d":
            if free_size_start:
                return None
            free_size_start = True

        # a width is only usable if a number remains once leading zeros,
        # which are a fill flag, have been removed
        has_width = bool(format_spec) and re.search('[0-9]', format_spec.lstrip('0')) is not None

        # make some data for this key and format
        if format_spec and '%' in format_spec:
            # some datetime
            t = dt.datetime.now()
            # run once through format to limit precision
            template = "{t:" + format_spec + "}"
            data[field_name] = parse(template, compose(template, {'t': t}))['t']
        elif format_spec and 'd' in format_spec:
            # random number (with n sign. figures)
            if not has_width:
                # clearly bad
                return None
            n = _get_number_from_fmt(format_spec)
            data[field_name] = random.randint(0, 99999999999999999) % (10 ** n)
        else:
            # string type; fall back to 4 characters when no width is given
            if has_width and format_spec.isalnum():
                n = _get_number_from_fmt(format_spec)
            else:
                n = 4
            data[field_name] = ''.join(random.choice(string.ascii_letters) for _ in range(n))
    return data
def is_one2one(fmt):
    """Check whether the format string has a one to one correspondence.

    That is, whether composing data into a string and parsing that string
    again yields the original data without any change or loss of
    information.

    Note: This test only applies to sensible usage of the format string.
    If string or numeric data causes overflow, e.g. when composing "abcd"
    into {3s}, one to one correspondence is always broken. The same applies
    to precision losses when using datetime data.
    """
    data = _generate_data_for_format(fmt)
    if data is None:
        return False

    # run data forward once and back to data
    roundtrip = parse(fmt, compose(fmt, data))

    # the round trip must give back exactly the fields and values we started with
    if len(roundtrip) != len(data):
        return False
    return not any(key not in roundtrip or roundtrip[key] != data[key] for key in data)
def purge():
    """Clear internal caches.

    Not needed normally, but can be used to force cache clear when memory
    is very limited.
    """
    regex_format.cache_clear()
    get_convert_dict.cache_clear()
    # RegexFormatter.format is cached as well and its cache holds on to the
    # formatter instances it was called on.
    RegexFormatter.format.cache_clear()
def _strict_compose(fmt, keyvals):
    """Format `fmt` with the parameters in `keyvals` passed as keyword arguments."""
    composed_string = formatter.format(fmt, **keyvals)
    return composed_string
def _partial_compose(fmt, keyvals):
    """Format `fmt` with `keyvals`, leaving unspecified parameters unchanged.

    Fields of `fmt` without a value in `keyvals` are swapped for placeholders
    before formatting and restored in the result afterwards.

    Args:
        fmt (str): Python format string to fill in
        keyvals (dict): "Parameter --> parameter value" map
    """
    placeholder_fmt, placeholders = _replace_undefined_params_with_placeholders(fmt, keyvals)
    result = _strict_compose(fmt=placeholder_fmt, keyvals=keyvals)
    for placeholder, original_field in placeholders.items():
        result = result.replace(placeholder, original_field)
    return result
def _replace_undefined_params_with_placeholders(fmt, keyvals=None):
    """Swap the fields of `fmt` that have no value in `keyvals` for placeholders.

    Returns:
        tuple: The modified format string and a "placeholder --> original
        field definition" map.

    Raises:
        ValueError: If the definition of an undefined field cannot be found
            in the format string.
    """
    undefined = get_convert_dict(fmt).keys()
    if keyvals is not None:
        undefined = undefined - keyvals.keys()

    placeholders = {}
    new_fmt = fmt
    for var in sorted(undefined):
        pattern = rf"{{{re.escape(var)}([^\w{{}}].*?)*}}"
        specifications = {found.group() for found in re.finditer(pattern, new_fmt)}
        if not specifications:
            raise ValueError(f"Could not capture definitions for {var} from {fmt}")
        for specification in specifications:
            # the placeholder contains no braces, so formatting leaves it untouched
            token = f"({hex(hash(specification))})"
            placeholders[token] = specification
            new_fmt = new_fmt.replace(specification, token)

    return new_fmt, placeholders