Source code for chemtools.basisparse

# -*- coding: utf-8 -*-

#The MIT License (MIT)
#
#Copyright (c) 2014 Lukasz Mentel
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.

from __future__ import division, print_function

import json
import re

import numpy as np
from mendeleev import element

CFDTYPE = [('idx', np.int32), ('cc', np.float64)]
ORBITALS = ('s', 'p', 'd', 'f', 'g', 'h', 'i', 'k')


[docs]def get_l(shell): 'Return the orbital angular momentum quantum number for a given subshell' if shell in ORBITALS: return ORBITALS.index(shell.lower()) else: raise ValueError('"{}" is not a proper shell label'.format(shell))
[docs]def parse_basis(string, fmt=None): ''' A wrapper for parsing the basis sets in different formats. Args: string : str A string with the basis set fmt : str Format in which the basis set is specified Returns: out : dict A dictionary of parsed basis sets with element symbols as keys and basis set functions as values ''' if fmt == 'gamessus': return parse_gamessus_basis(string) elif fmt == 'gaussian': return parse_gaussian_basis(string) elif fmt == 'molpro': return parse_molpro_basis(string) else: formats = ['molpro', 'gamessus', 'gaussian'] raise ValueError("Can only parse <fmt>: {}".format(", ".join(formats)))
[docs]def parse_molpro_basis(string): ''' Parse basis set from a string in Molpro format. ''' bas_re = re.compile(r'basis\s*=\s*\{(.*?)\}', flags=re.DOTALL | re.I) m = bas_re.search(string) if m: lines = m.group(1).split("\n") else: raise ValueError('basis string: "basis={*}" not found') start = [ i for i, line in enumerate(lines) if line.split(",")[0].lower() in ORBITALS ] if not start: return None startstop = [(start[i], start[i + 1]) for i in range(len(start) - 1)] startstop.append((start[-1], len(lines))) bs = {} for i in startstop: at_symbol, shell = parse_molpro_shell(lines[i[0]], lines[i[0] + 1: i[1]]) if at_symbol in bs: bs[at_symbol] = dict(list(bs[at_symbol].items()) + list(shell.items())) else: bs[at_symbol] = shell return bs
[docs]def parse_molpro_shell(expsline, coeffs): ''' Parse functions of one shell in molpro format. ''' real = re.compile(r'[dD]') # remove empty strings and whitespace line breaks and tabs coeffs = [x.strip() for x in coeffs if x.strip() not in ['', '\n', '\t']] fs = {} shell = expsline.split(",")[0].lower() at_symbol = expsline.split(",")[1].strip().capitalize() exps = np.array([float(real.sub('E', x)) for x in expsline.rstrip(";").split(",")[2:]]) fs[shell] = {'e': exps, 'cf': []} if not coeffs: for i in range(len(exps)): fs[shell]['cf'].append(np.array([tuple([i, 1.0])], dtype=CFDTYPE)) else: for line in coeffs: lsp = line.rstrip(";").split(",") if lsp[0] == "c": i, j = [int(x) for x in lsp[1].split(".")] coeffs = [float(real.sub('E', x)) for x in lsp[2:]] fs[shell]['cf'].append(np.array(list(zip(list(range(i - 1, j)), coeffs)), dtype=CFDTYPE)) return at_symbol, fs
def parse_ecp(ecpstring): ecp_re = re.compile(r'\!\s*Effective core Potentials.*-{25}\s*\n(.*?)\n\s*\n', flags=re.DOTALL) lines = ecpstring.split("\n") start = [ i for i, line in enumerate(lines) if line.split(",")[0].lower() == 'ecp' ] if not start: return None startstop = [(start[i], start[i + 1]) for i in range(len(start) - 1)] startstop.append((start[-1], len(lines))) ecp = {} for i in startstop: ecp = dict(list(ecp.items()) + list(parse_coeffs(lines[i[0]: i[1]]).items())) return ecp def parse_coeffs(lines): firstl = lines[0].replace(';', '').split(',') element = firstl[1].strip().capitalize() nele = firstl[2] lmax = firstl[3] liter = iter(x for x in lines[1:] if x != '') ecp = {element: {"nele": nele, "lmax": lmax, "shells": []}} while True: try: temp = next(liter) except StopIteration: break nlmax = int(temp.split(";")[0]) comment = temp.split(";")[1].replace("!", "") tt = {'comment': comment, 'parameters': []} for _ in range(nlmax): param = next(liter).replace(";", "").split(",") tt['parameters'].append({'m': float(param[0]), 'gamma': float(param[1]), 'c': float(param[2])}) ecp[element]['shells'].append(tt) return ecp
[docs]def parse_gaussian_basis(string): ''' Parse the basis set into a list of dictionaries from a string in gaussian format. ''' shellre = re.compile(r'^\s*(?P<shells>[SPDFGHILspdfghil]+)\s*(?P<nf>[1-9]+)\s*(?P<scale>\-?\d+\.\d+)') out = {} for i, item in enumerate(string.split('****\n')): if len(item) > 5: atomline, basis = item.split('\n', 1) atom = atomline.split()[0] bslines = basis.split('\n') functions = {} for i, line in enumerate(bslines): match = shellre.search(line) if match: shells, nf, scale = match.group('shells').lower(), match.group('nf'), match.group('scale') exps, indxs, coeffs = parse_gaussian_function(bslines[i + 1: i + int(nf) + 1]) for shell, cc in zip(shells, coeffs.T): if shell in functions: sexp, idxs, idxo = merge_exponents(functions[shell]['e'], exps) functions[shell]['e'] = sexp * float(scale)**2 for cf in functions[shell]['cf']: cf['idx'] = idxs[cf['idx']] newcf = np.array(list(zip(idxo[indxs], cc)), dtype=CFDTYPE) functions[shell]['cf'].append(newcf) else: functions[shell] = {'cf': [], 'e': exps} functions[shell]['cf'].append(np.array(list(zip(indxs, cc)), dtype=CFDTYPE)) out[atom] = functions return out
[docs]def parse_gaussian_function(lines): ''' Parse a basis set function information from list of strings into three lists containg: exponents, indices, coefficients. Remeber that python doesn't recognise the `1.0d-3` format where `d` or `D` is used to the regex subsitution has to take care of that. ''' real = re.compile(r'[dD]') indxs = np.arange(len(lines), dtype=np.int32) exps = np.array([float(real.sub('E', line.split()[0])) for line in lines], dtype=np.float64) coeffs = np.array([[float(real.sub('E', x)) for x in line.split()[1:]] for line in lines], dtype=np.float64) return (exps, indxs, coeffs)
[docs]def parse_gamessus_basis(string): ''' Parse the basis set into a list of dictionaries from a string in gamess format. ''' bas_re = re.compile(r'\$DATA\n(.*?)\$END', flags=re.DOTALL | re.IGNORECASE) m = bas_re.search(string) if m: basisstr = m.group(1) else: raise ValueError("basis not found, should be inside '$DATA' and '$END' group") pat = re.compile(r'^\s*(?P<shells>[SPDFGHILspdfghil]+)\s*(?P<nf>[0-9]+)') res = {} for item in basisstr.split('\n\n'): if len(item) > 0: atom, basis = item.split('\n', 1) if atom != "": functions = {} elem = element(atom.capitalize()) bslines = basis.split("\n") for i, line in enumerate(bslines): match = pat.search(line) if match: shells, nf = match.group("shells").lower(), match.group("nf") exps, indxs, coeffs = parse_gamessus_function(bslines[i + 1: i + int(nf) + 1]) if shells in ['L', 'l']: shells = ['s', 'p'] for shell, cc in zip(shells, coeffs.T): if shell in functions: sexp, idxs, idxo = merge_exponents(functions[shell]['e'], exps) functions[shell]['e'] = sexp for cf in functions[shell]['cf']: cf['idx'] = idxs[cf['idx']] newcf = np.array(list(zip(idxo[indxs-1], cc)), dtype=CFDTYPE) functions[shell]['cf'].append(newcf) else: functions[shell] = {'cf': [], 'e': exps} functions[shell]['cf'].append(np.array(list(zip(indxs - 1, cc)), dtype=CFDTYPE)) res[elem.symbol] = functions return res
[docs]def parse_gamessus_function(lines): ''' Parse a basis set function information from list of strings into three lists containg: exponents, indices, coefficients. Remeber that python doesn't recognise the `1.0d-3` format where `d` or `D` is used to the regex subsitution has to take care of that. ''' real = re.compile(r'[dD]') indxs = np.array([int(line.split()[0]) for line in lines], dtype=np.int32) exps = np.array([float(real.sub('E', line.split()[1])) for line in lines], dtype=np.float64) coeffs = np.array([[float(real.sub('E', x)) for x in line.split()[2:]] for line in lines], dtype=np.float64) return (exps, indxs, coeffs)
[docs]def merge_exponents(a, b): ''' Concatenate the arrays `a` and `b` using only the unique items from both arrays Args: a : numpy.array b : numpy.array Returns: res : 3-tuple of numpy arrays - res[0] sorted union of `a` and `b` - res[1] indices of `a` items in res[0] - res[2] indices of `b` items in res[0] ''' u = np.union1d(a, b)[::-1] idxa = np.where(np.in1d(u, a))[0] idxb = np.where(np.in1d(u, b))[0] return u, idxa, idxb
[docs]class NumpyEncoder(json.JSONEncoder):
[docs] def default(self, obj): ''' If input object is an ndarray it will be converted into a dict holding the data and dtype. ''' if isinstance(obj, np.ndarray): dtype = 'CFDTYPE' if obj.dtype == CFDTYPE else str(obj.dtype) return dict(data=obj.tolist(), dtype=dtype) # Let the base class default method raise the TypeError return json.JSONEncoder.default(self, obj)