# Source code for chemtools.parsetools

# -*- coding: utf-8 -*-

#The MIT License (MIT)
#
#Copyright (c) 2014 Lukasz Mentel
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.

'''
Module with convenience functions for parsing files
'''

from __future__ import print_function

import itertools
from collections import defaultdict


def contains(string, query):
    'Check if `string` contains `query` and return the result as a bool'
    # the membership operator is the idiomatic substring test
    return query in string


def locatelinenos(filename, tolocate):
    '''
    Given a file and a list of strings return a dict with the indices of the
    strings as keys and the line numbers in which they appear as values.

    Args:
      filename : str
        Name of the file
      tolocate : list of tuples
        List of tuples with strings to find (queries) as first elements and
        integer offset values as second

    Returns:
      out : defaultdict(list)
        Dictionary whose keys are indices corresponding to items in the
        input list and values are lists of line numbers in which those
        strings appear. Line numbers are counted from zero and have the
        offset of the matching query added. Queries that are not found in
        the file have no key in the dictionary.

    TODO:
        - add option to ignore the case of the strings to search
    '''

    out = defaultdict(list)
    # the context manager guarantees the file is closed, also on errors
    with open(filename, 'r') as fobj:
        for lineno, line in enumerate(fobj):
            for idx, (query, offset) in enumerate(tolocate):
                if query in line:
                    out[idx].append(lineno + offset)
    return out


def getlines(filename, tolocate):
    '''
    Return the lines from the file delimited by the lines located with
    `tolocate`.

    Every query has to be found in exactly one line of the file. The
    smallest and the largest of the resulting (offset corrected) line
    numbers are passed to `getchunk` as the start and the end of the chunk
    to extract.

    Args:
      filename : str
        Name of the file
      tolocate : list of tuples
        List of tuples with strings to find (queries) as first elements and
        integer offset values as second

    Returns:
      lines : list
        The list of lines returned by `getchunk` for the located range

    Raises:
      ValueError
        If `tolocate` is empty, if any of the queries is not found in the
        file or if any of the queries is found in more than one line
    '''

    if len(tolocate) == 0:
        raise ValueError('`tolocate` is empty, nothing to locate')

    located = locatelinenos(filename, tolocate)

    # report the queries that were not found instead of only the counts
    missing = [query for idx, (query, _) in enumerate(tolocate)
               if idx not in located]
    if missing:
        raise ValueError('no lines found for: {0}'.format(
            ', '.join(['"{0}"'.format(query) for query in missing])))

    for k, v in located.items():
        if len(v) > 1:
            raise ValueError('multiple lines found for "{0}": {1}'.format(
                tolocate[k][0], ', '.join([str(x) for x in v])))

    # at this point every query corresponds to exactly one line number
    linenos = [v[0] for v in located.values()]
    return getchunk(filename, min(linenos), max(linenos))


def getchunk(filename, startlno, endlno):
    '''
    Get a list of lines from a file between specified line numbers `startlno`
    and `endlno`.

    Line numbers are counted from zero. The line number `startlno` is
    included in the result and the line number `endlno` is not, so that
    `endlno - startlno` lines are returned.

    Args:
      filename : str
        Name of the file to process
      startlno : int
        Number of the first line to obtain
      endlno : int
        Number of the line at which to stop (this line is not returned)

    Returns:
      lines : list
        A list of lines from the file `filename` between line numbers
        `startlno` and `endlno`

    Raises:
      StopIteration
        If the file ends before the requested range is exhausted
    '''

    # the context manager guarantees the file is closed, also on errors
    with open(filename, 'r') as fobj:
        # skip the lines preceding the requested chunk
        for _ in range(startlno):
            next(fobj)

        return [next(fobj) for _ in range(endlno - startlno)]


def take(seq, num):
    '''
    Advance the iterator `seq` `num` times and return the consumed elements
    as a list.

    `seq` is passed directly to `next`, so it has to be an iterator. If it
    is exhausted before `num` elements are collected the `StopIteration`
    raised by `next` is propagated.
    '''
    taken = []
    for _ in range(num):
        taken.append(next(seq))
    return taken


def parsepairs(los, sep="="):
    '''
    Build a dictionary from the strings in "los" that contain the "sep"
    character.

    Each such string is split on "sep" into a name and a value, the name
    stripped of the surrounding whitespace becomes the key and the value
    converted to float becomes the dictionary value. Strings without "sep"
    are skipped.
    '''

    pairs = {}
    for entry in los:
        if sep not in entry:
            continue
        key, raw = entry.split(sep)
        pairs[key.strip()] = float(raw)
    return pairs


def sliceafter(seq, item, num):
    '''
    Find the first element of "seq" that contains "item" and return the list
    of "num" elements that follow it.

    If no element contains "item" None is returned. If fewer than "num"
    elements follow the matching one the `StopIteration` raised by `next`
    is propagated.
    '''

    remaining = iter(seq)
    for current in remaining:
        if item not in current:
            continue
        # consume the elements following the match from the same iterator
        collected = []
        for _ in range(num):
            collected.append(next(remaining))
        return collected
    return None


def slicebetween(string, start, end):
    '''
    Return a slice of the `string` between phrases `start` and `end`.

    The slice begins right after the first occurrence of `start` and stops
    right before the first occurrence of `end` that follows it.

    Raises:
      ValueError
        If `start` is not found in `string` or `end` is not found after
        `start`
    '''

    # position of the first character after the `start` phrase
    ibegin = string.index(start) + len(start)
    # look for `end` only past `start`, otherwise an `end` that occurs
    # inside `start` (e.g. identical delimiters) would match too early
    iend = string.index(end, ibegin)
    return string[ibegin:iend]