Source code for yoda.search

#! /usr/bin/env python

import re


## A tool for filtering AO collections by path patterns

[docs]
def match_aos(aos, patts, unpatts=None, search=False):
    """
    Filter a list of analysis objects to those which match given path-matching patterns.

    @a patts is a regex or iterable of regexes for positive matching, i.e. retention;
    @a unpatts is the equivalent for negative matching, i.e. rejection even if a patt matches.

    @a search will use Python regex search mode rather than match mode, i.e. match if any
    part of the path fits the regex, rather than requiring a match from the start of the path.
    """

    rtn = []

    ## Internally we just use the list of AOs, not dict keys
    if type(aos) is dict:
        aos = list(aos.values()) #< TODO: maybe iter would be ok?

    ## Normalise the pattern arg inputs
    if patts and type(patts) is str:
        patts = [patts]
    re_patts = [re.compile(p) for p in patts] if patts else None
    if unpatts and type(unpatts) is str:
        unpatts = [unpatts]
    re_unpatts = [re.compile(p) for p in unpatts] if unpatts else None

    ## Apply pattern matching to each AO
    for ao in aos:
        match = False
        if re_patts:
            for rp in re_patts:
                if (not search and rp.match(ao.path)) or (search and rp.search(ao.path)):
                    match = True
                    break
        if match and re_unpatts:
            for rp in re_unpatts:
                if (not search and rp.match(ao.path)) or (search and rp.search(ao.path)):
                    match = False
                    break
        if match:
            rtn.append(ao)

    return rtn





[docs]
class PointMatcher(object):
    """\
    System for selecting subsets of bins based on a search range
    syntax extended from Professor weight files:

    Path structures:
     - /path/parts/to/histo[syst_variation]@xmin:xmax
     - /path/parts/to/histo[syst_variation]#nmin:nmax

    TODO: Extend to multi-dimensional ranges i.e. @xmin:xmax,#nymin:nymax,...
    """

    def __init__(self, patt):
        self.re_patt = re.compile(r"([^#@]+)(#\d+|@[\d\.:]+)?")
        self.set_patt(patt)


[docs]
    def set_patt(self, patt):
        "Find path and index/pos parts of patt and assign them to object attrs"
        self.patt = None
        self.path = None
        self.indextype = None
        self.index = None
        if not patt:
            return
        ## Strip separated comments
        patt = re.sub(r"(^|\s+)#.*", "", patt)
        self.patt = patt.strip()
        match = self.re_patt.match(self.patt)
        if match:
            self.path = re.compile(match.group(1))
            if match.group(2):
                # TODO: handle mixed-type ranges?
                self.indextype, indexstr = match.group(2)[0], match.group(2)[1:]
                if self.indextype:
                    if not ":" in indexstr:
                        self.index = float(indexstr)
                    else:
                        indexstr2 = indexstr.split(":", 1)
                        if not indexstr2[0]: indexstr2[0] = "-inf"
                        if not indexstr2[1]: indexstr2[1] = "inf"
                        self.index = [float(istr) for istr in indexstr2]


    def match_path(self, path):
        return self.path.match(path) is not None

    def search_path(self, path):
        return self.path.search(path) is not None


[docs]
    def match_pos(self, p):
        """Decide if a given point p is in the match range.

        p must be an object with attrs xmin, xmax, n

        TODO: Use open ranges to include underflow and overflow

        TODO: Allow negative indices in Python style, and use index=-1
        to mean the N+1 index needed to include the last bin without
        picking up the overflow, too.

        TODO: Extension to multiple dimensions
        """
        if not self.indextype:
            accept = True
        elif self.indextype == "#":
            if type(self.index) is float:
                accept = (p.n == int(self.index))
            else:
                accept = (p.n >= self.index[0] and p.n < self.index[1])
        else: # self.indextype == "@"
            if type(self.index) is float:
                accept = (self.index >= p.xmin and self.index < p.xmax)
            else:
                accept = (p.xmax > self.index[0] and p.xmin <= self.index[1])
        return accept


    def __repr__(self):
        s = "PointMatcher('%s' %s %s %s)" % (self.patt, self.path, self.indextype, self.index)
        return s



if __name__ == "__main__":

    from . import linspace, logspace
    from pprint import pprint
    import math

    class Point(object):
        def __init__(self, path, n, xmin, xmax, value=None):
            self.path = path
            self.n = n #< bin index
            self.xmin = xmin
            self.xmax = xmax
            self.value = value
        def __repr__(self):
            val = " = {}".format(self.value) if self.value else ""
            return "Point({} #{:d}  {:.2e}--{:.2e}{})".format(self.path, self.n, self.xmin, self.xmax, val)

    POINTS = []
    xs = linspace(5, 0, 10)
    POINTS += [Point("/foo", i, xs[i], xs[i+1]) for i in range(len(xs)-1)]
    xs = linspace(50, 0, 100)
    POINTS += [Point("/bar", i, xs[i], xs[i+1]) for i in range(len(xs)-1)]
    xs = linspace(20, 0, math.pi)
    POINTS += [Point("/baz/pi", i, xs[i], xs[i+1]) for i in range(len(xs)-1)]
    xs = logspace(20, 0.1, 50)
    POINTS += [Point("/baz/log", i, xs[i], xs[i+1]) for i in range(len(xs)-1)]
    pprint(POINTS)

    pms = [PointMatcher("/foo   # this bit is a comment"),
           PointMatcher("/bar#1  # this bit is a comment"),
           PointMatcher("/baz/pi@2.76  # this bit is a comment"),
           PointMatcher("/baz/.*@:1.32 # this bit is a comment"),
           PointMatcher("/baz/.*@2.76: # this bit is a comment")]
    pprint(pms)

    for pm in pms:
        print()
        print(pm.patt, pm.indextype, pm.index)
        pprint( [p for p in POINTS if pm.match_path(p.path) and pm.match_pos(p)] )