Source code for chemtools.basisparse

# -*- coding: utf-8 -*-

#The MIT License (MIT)
#
#Copyright (c) 2014 Lukasz Mentel
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.

from __future__ import division, print_function

import json
import re

import numpy as np
from mendeleev import element

CFDTYPE = [('idx', np.int32), ('cc', np.float64)]
ORBITALS = ('s', 'p', 'd', 'f', 'g', 'h', 'i', 'k')


[docs]def get_l(shell):
    'Return the orbital angular momentum quantum number for a given subshell'

    if shell in ORBITALS:
        return ORBITALS.index(shell.lower())
    else:
        raise ValueError('"{}" is not a proper shell label'.format(shell))


[docs]def parse_basis(string, fmt=None):
    '''
    A wrapper for parsing the basis sets in different formats.

    Args:
      string : str
        A string with the basis set

      fmt : str
        Format in which the basis set is specified

    Returns:
      out : dict
        A dictionary of parsed basis sets with element symbols as keys and
        basis set functions as values
    '''

    if fmt == 'gamessus':
        return parse_gamessus_basis(string)
    elif fmt == 'gaussian':
        return parse_gaussian_basis(string)
    elif fmt == 'molpro':
        return parse_molpro_basis(string)
    else:
        formats = ['molpro', 'gamessus', 'gaussian']

        raise ValueError("Can only parse <fmt>: {}".format(", ".join(formats)))


[docs]def parse_molpro_basis(string):
    '''
    Parse basis set from a string in Molpro format.
    '''

    bas_re = re.compile(r'basis\s*=\s*\{(.*?)\}', flags=re.DOTALL | re.I)

    m = bas_re.search(string)
    if m:
        lines = m.group(1).split("\n")
    else:
        raise ValueError('basis string: "basis={*}" not found')

    start = [
        i
        for i, line in enumerate(lines)
        if line.split(",")[0].lower() in ORBITALS
    ]

    if not start:
        return None

    startstop = [(start[i], start[i + 1]) for i in range(len(start) - 1)]
    startstop.append((start[-1], len(lines)))

    bs = {}
    for i in startstop:
        at_symbol, shell = parse_molpro_shell(lines[i[0]],
                                              lines[i[0] + 1: i[1]])
        if at_symbol in bs:
            bs[at_symbol] = dict(list(bs[at_symbol].items()) +
                                 list(shell.items()))
        else:
            bs[at_symbol] = shell
    return bs


[docs]def parse_molpro_shell(expsline, coeffs):
    '''
    Parse functions of one shell in molpro format.
    '''

    real = re.compile(r'[dD]')
    # remove empty strings and whitespace line breaks and tabs
    coeffs = [x.strip() for x in coeffs if x.strip() not in ['', '\n', '\t']]

    fs = {}

    shell = expsline.split(",")[0].lower()
    at_symbol = expsline.split(",")[1].strip().capitalize()
    exps = np.array([float(real.sub('E', x))
                     for x in expsline.rstrip(";").split(",")[2:]])

    fs[shell] = {'e': exps, 'cf': []}
    if not coeffs:
        for i in range(len(exps)):
            fs[shell]['cf'].append(np.array([tuple([i, 1.0])], dtype=CFDTYPE))
    else:
        for line in coeffs:
            lsp = line.rstrip(";").split(",")
            if lsp[0] == "c":
                i, j = [int(x) for x in lsp[1].split(".")]
                coeffs = [float(real.sub('E', x)) for x in lsp[2:]]
                fs[shell]['cf'].append(np.array(list(zip(list(range(i - 1, j)),
                                                coeffs)), dtype=CFDTYPE))
    return at_symbol, fs


def parse_ecp(ecpstring):

    ecp_re = re.compile(r'\!\s*Effective core Potentials.*-{25}\s*\n(.*?)\n\s*\n', flags=re.DOTALL)

    lines = ecpstring.split("\n")

    start = [
        i
        for i, line in enumerate(lines)
        if line.split(",")[0].lower() == 'ecp'
    ]

    if not start:
        return None

    startstop = [(start[i], start[i + 1]) for i in range(len(start) - 1)]
    startstop.append((start[-1], len(lines)))

    ecp = {}
    for i in startstop:
        ecp = dict(list(ecp.items()) +
                   list(parse_coeffs(lines[i[0]: i[1]]).items()))
    return ecp


def parse_coeffs(lines):

    firstl = lines[0].replace(';', '').split(',')
    element = firstl[1].strip().capitalize()
    nele = firstl[2]
    lmax = firstl[3]

    liter = iter(x for x in lines[1:] if x != '')

    ecp = {element: {"nele": nele, "lmax": lmax, "shells": []}}

    while True:
        try:
            temp = next(liter)
        except StopIteration:
            break
        nlmax = int(temp.split(";")[0])
        comment = temp.split(";")[1].replace("!", "")
        tt = {'comment': comment, 'parameters': []}
        for _ in range(nlmax):
            param = next(liter).replace(";", "").split(",")
            tt['parameters'].append({'m': float(param[0]),
                                     'gamma': float(param[1]),
                                     'c': float(param[2])})
        ecp[element]['shells'].append(tt)
    return ecp


[docs]def parse_gaussian_basis(string):
    '''
    Parse the basis set into a list of dictionaries from a string in
    gaussian format.
    '''

    shellre = re.compile(r'^\s*(?P<shells>[SPDFGHILspdfghil]+)\s*(?P<nf>[1-9]+)\s*(?P<scale>\-?\d+\.\d+)')
    out = {}
    for i, item in enumerate(string.split('****\n')):
        if len(item) > 5:
            atomline, basis = item.split('\n', 1)
            atom = atomline.split()[0]
            bslines = basis.split('\n')
            functions = {}
            for i, line in enumerate(bslines):
                match = shellre.search(line)
                if match:
                    shells, nf, scale = match.group('shells').lower(), match.group('nf'), match.group('scale')
                    exps, indxs, coeffs = parse_gaussian_function(bslines[i + 1: i + int(nf) + 1])
                    for shell, cc in zip(shells, coeffs.T):
                        if shell in functions:
                            sexp, idxs, idxo = merge_exponents(functions[shell]['e'], exps)
                            functions[shell]['e'] = sexp * float(scale)**2
                            for cf in functions[shell]['cf']:
                                cf['idx'] = idxs[cf['idx']]
                            newcf = np.array(list(zip(idxo[indxs], cc)),
                                             dtype=CFDTYPE)
                            functions[shell]['cf'].append(newcf)
                        else:
                            functions[shell] = {'cf': [], 'e': exps}
                            functions[shell]['cf'].append(np.array(list(zip(indxs, cc)), dtype=CFDTYPE))
                out[atom] = functions
    return out


[docs]def parse_gaussian_function(lines):
    '''
    Parse a basis set function information from list of strings into
    three lists containg: exponents, indices, coefficients.

    Remeber that python doesn't recognise the `1.0d-3` format where `d` or
    `D` is used to the regex subsitution has to take care of that.
    '''

    real = re.compile(r'[dD]')

    indxs = np.arange(len(lines), dtype=np.int32)
    exps = np.array([float(real.sub('E', line.split()[0])) for line in lines],
                    dtype=np.float64)
    coeffs = np.array([[float(real.sub('E', x))
                        for x in line.split()[1:]] for line in lines],
                      dtype=np.float64)
    return (exps, indxs, coeffs)


[docs]def parse_gamessus_basis(string):
    '''
    Parse the basis set into a list of dictionaries from a string in
    gamess format.
    '''

    bas_re = re.compile(r'\$DATA\n(.*?)\$END', flags=re.DOTALL | re.IGNORECASE)

    m = bas_re.search(string)
    if m:
        basisstr = m.group(1)
    else:
        raise ValueError("basis not found, should be inside '$DATA' and '$END' group")

    pat = re.compile(r'^\s*(?P<shells>[SPDFGHILspdfghil]+)\s*(?P<nf>[0-9]+)')
    res = {}

    for item in basisstr.split('\n\n'):
        if len(item) > 0:
            atom, basis = item.split('\n', 1)
            if atom != "":
                functions = {}
                elem = element(atom.capitalize())
                bslines = basis.split("\n")
                for i, line in enumerate(bslines):
                    match = pat.search(line)
                    if match:
                        shells, nf = match.group("shells").lower(), match.group("nf")
                        exps, indxs, coeffs = parse_gamessus_function(bslines[i + 1: i + int(nf) + 1])
                        if shells in ['L', 'l']:
                            shells = ['s', 'p']
                        for shell, cc in zip(shells, coeffs.T):
                            if shell in functions:
                                sexp, idxs, idxo = merge_exponents(functions[shell]['e'], exps)
                                functions[shell]['e'] = sexp
                                for cf in functions[shell]['cf']:
                                    cf['idx'] = idxs[cf['idx']]
                                newcf = np.array(list(zip(idxo[indxs-1], cc)),
                                                 dtype=CFDTYPE)
                                functions[shell]['cf'].append(newcf)
                            else:
                                functions[shell] = {'cf': [], 'e': exps}
                                functions[shell]['cf'].append(np.array(list(zip(indxs - 1, cc)), dtype=CFDTYPE))
                res[elem.symbol] = functions
    return res


[docs]def parse_gamessus_function(lines):
    '''
    Parse a basis set function information from list of strings into
    three lists containg: exponents, indices, coefficients.

    Remeber that python doesn't recognise the `1.0d-3` format where `d` or
    `D` is used to the regex subsitution has to take care of that.
    '''

    real = re.compile(r'[dD]')

    indxs = np.array([int(line.split()[0]) for line in lines], dtype=np.int32)
    exps = np.array([float(real.sub('E', line.split()[1])) for line in lines],
                    dtype=np.float64)
    coeffs = np.array([[float(real.sub('E', x)) for x in line.split()[2:]]
                        for line in lines], dtype=np.float64)
    return (exps, indxs, coeffs)


[docs]def merge_exponents(a, b):
    '''
    Concatenate the arrays `a` and `b` using only the unique items from both
    arrays

    Args:
      a : numpy.array
      b : numpy.array

    Returns:
      res : 3-tuple of numpy arrays
        - res[0] sorted union of `a` and `b`
        - res[1] indices of `a` items in res[0]
        - res[2] indices of `b` items in res[0]
    '''

    u = np.union1d(a, b)[::-1]
    idxa = np.where(np.in1d(u, a))[0]
    idxb = np.where(np.in1d(u, b))[0]
    return u, idxa, idxb


[docs]class NumpyEncoder(json.JSONEncoder):

[docs]    def default(self, obj):
        '''
        If input object is an ndarray it will be converted into a dict
        holding the data and dtype.
        '''

        if isinstance(obj, np.ndarray):

            dtype = 'CFDTYPE' if obj.dtype == CFDTYPE else str(obj.dtype)
            return dict(data=obj.tolist(), dtype=dtype)
        # Let the base class default method raise the TypeError
        return json.JSONEncoder.default(self, obj)