diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c520141..03be7698 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # dev +# Version 0.6.0 (January 2019) + - Added ML-MIN algorithm for energy minimization. - Added ML-NEB algorithm for transition state search. - Changed input format for kernels in the GP. diff --git a/catlearn/__init__.py b/catlearn/__init__.py index 1ff08cdd..906d362f 100644 --- a/catlearn/__init__.py +++ b/catlearn/__init__.py @@ -1 +1 @@ -__version__ = "0.6.0.dev3" +__version__ = "0.6.0" diff --git a/catlearn/api/ase_atoms_api.py b/catlearn/api/ase_atoms_api.py index 3ba076fb..ce2a4579 100644 --- a/catlearn/api/ase_atoms_api.py +++ b/catlearn/api/ase_atoms_api.py @@ -27,7 +27,7 @@ def database_to_list(fname, selection=None): atoms.info['id'] = dbid atoms.info['ctime'] = float(d.ctime) atoms.subsets = {} - if 'data' in d and 'connectivity' in d.data: + if hasattr(d, 'data') and 'connectivity' in dict(d.data): atoms.connectivity = np.array(d.data.connectivity) images.append(atoms) diff --git a/catlearn/featurize/setup.py b/catlearn/featurize/setup.py index 4df8e2c2..cada1e82 100644 --- a/catlearn/featurize/setup.py +++ b/catlearn/featurize/setup.py @@ -9,6 +9,7 @@ from collections import defaultdict import multiprocessing from tqdm import tqdm +from catlearn.fingerprint.molecule import AutoCorrelationFingerprintGenerator from catlearn.fingerprint.adsorbate import (AdsorbateFingerprintGenerator, default_adsorbate_fingerprinters) from catlearn.fingerprint.convoluted import (ConvolutedFingerprintGenerator, @@ -59,7 +60,8 @@ class FeatureGenerator( AdsorbateFingerprintGenerator, ParticleFingerprintGenerator, StandardFingerprintGenerator, GraphFingerprintGenerator, BulkFingerprintGenerator, ConvolutedFingerprintGenerator, - ChalcogenideFingerprintGenerator, CatappFingerprintGenerator): + ChalcogenideFingerprintGenerator, CatappFingerprintGenerator, + AutoCorrelationFingerprintGenerator): """Feature generator class. It is sometimes necessary to normalize the length of feature vectors when @@ -303,6 +305,32 @@ def _get_atom_types(self, train_candidates, test_candidates=None): self.atom_types = atom_types + def _get_ads_atom_types(self, train_candidates, test_candidates=None): + """Function to get all potential atomic types in data. + + Parameters + ---------- + train_candidates : list + List of atoms objects. + test_candidates : list + List of atoms objects. + + Returns + ------- + atom_types : list + Full list of atomic numbers in adsorbate atoms subsets. + """ + train_candidates = list(train_candidates) + if test_candidates is not None: + train_candidates += list(test_candidates) + ads_atom_types = set() + for a in train_candidates: + ads_atom_types.update( + set(a.get_atomic_numbers()[a.subsets['ads_atoms']])) + ads_atom_types = sorted(list(ads_atom_types)) + + self.ads_atom_types = ads_atom_types + def _get_atom_length(self, train_candidates, test_candidates=None): """Function to get all potential system sizes in data. diff --git a/catlearn/fingerprint/adsorbate.py b/catlearn/fingerprint/adsorbate.py index c901830b..802da185 100644 --- a/catlearn/fingerprint/adsorbate.py +++ b/catlearn/fingerprint/adsorbate.py @@ -1,17 +1,14 @@ """Slab adsorbate fingerprint functions for machine learning.""" import numpy as np -import collections from ase.symbols import string2symbols from ase.data import ground_state_magnetic_moments as gs_magmom from ase.data import atomic_numbers, chemical_symbols -from catlearn.featurize.periodic_table_data import (get_mendeleev_params, - n_outer, - list_mendeleev_params, +from catlearn.featurize.periodic_table_data import (list_mendeleev_params, default_params, get_radius, electronegativities, - block2number, make_labels) + make_labels) from catlearn.featurize.base import BaseGenerator, check_labels @@ -32,8 +29,8 @@ 'generalized_cn', 'bag_cn', 'bag_atoms_ads', - 'bag_connections_ads', - 'bag_connections_chemi'] + 'bag_edges_ads', + 'bag_edges_chemi'] extra_slab_params = ['atomic_radius', 'heat_of_formation', @@ -645,7 +642,7 @@ def strain(self, atoms=None): strain_term = (av_term - av_bulk) / av_bulk return [strain_site, strain_term] - def bag_connections_ads(self, atoms): + def bag_edges_ads(self, atoms): """Returns bag of connections, counting only the bonds within the adsorbate. @@ -693,7 +690,7 @@ def bag_connections_ads(self, atoms): boc[bond_type] += 1 return list(boc[np.triu_indices_from(boc)]) - def bag_connections_chemi(self, atoms): + def bag_edges_chemi(self, atoms): """Returns bag of connections, counting only the bonds within the adsorbate and the connections between adsorbate and surface. @@ -742,6 +739,63 @@ def bag_connections_chemi(self, atoms): return list(boc[np.triu_indices_from(boc)]) + def bag_edges_all(self, atoms): + """Returns bag of connections, counting all bonds within the + adsorbate and between adsorbate atoms and surface. If we assign an + energy to each type of bond, considering first neighbors only, + this fingerprint would work independently in a linear model. The length + of the vector is atom_types * ads_atom_types. + + Parameters + ---------- + atoms : object + ASE Atoms object. + + Returns + ---------- + features : list + If None was passed, the elements are strings, naming the feature. + """ + # number of element types. + n_elements = len(self.atom_types) + n_elements_ads = len(self.ads_atom_types) + + # range of element types. + symbols = np.array([chemical_symbols[z] for z in self.atom_types]) + ads_symbols = np.array([chemical_symbols[z] for z + in self.ads_atom_types]) + + # Array of pairs. + rows, cols = np.meshgrid(symbols, ads_symbols) + + # Add pairs to make labels. + pairs = np.core.defchararray.add(rows, cols) + labels = ['bea_' + c + '_ads' for c in + pairs[np.triu_indices_from(pairs)]] + if atoms is None: + return labels + else: + # empty bag of connection types. + boc = np.zeros([n_elements_ads, n_elements]) + + natoms = len(atoms) + ads_atoms = atoms.subsets['ads_atoms'] + # n_ads_atoms = len(atoms.subsets['ads_atoms']) + cm = np.array(atoms.connectivity)[ads_atoms, :] + np.fill_diagonal(cm, 0) + + bonds = np.where(np.ravel(np.triu(cm)) > 0)[0] + for b in bonds: + # Get bonded atomic numbers. + z_ads, z_all = np.unravel_index(b, [natoms, natoms]) + bond_index = (atoms.numbers[ads_atoms][z_ads], + atoms.numbers[z_all]) + bond_type = tuple((self.ads_atom_types.index(bond_index[0]), + self.atom_types.index(bond_index[1]))) + # Count bonds in upper triangle. + boc[bond_type] += 1 + return list(boc[np.triu_indices_from(boc)]) + def en_difference_ads(self, atoms=None): """Returns a list of electronegativity metrics, squared and summed over bonds within the adsorbate atoms. diff --git a/catlearn/fingerprint/molecule.py b/catlearn/fingerprint/molecule.py index da333f33..23ece6ad 100644 --- a/catlearn/fingerprint/molecule.py +++ b/catlearn/fingerprint/molecule.py @@ -1,21 +1,20 @@ """Functions to build a gas phase molecule fingerprint.""" -from catlearn.utilities.neighborlist import catlearn_neighborlist +from catlearn.featurize.base import BaseGenerator from catlearn.featurize.periodic_table_data import list_mendeleev_params import networkx as nx import numpy as np -from ase import Atoms + default_parameters = [ 'atomic_number', 'covalent_radius_cordero', - 'en_pauling', -] + 'en_pauling'] -class AutoCorrelationFingerprintGenerator(): +class AutoCorrelationFingerprintGenerator(BaseGenerator): """Class for constructing an autocorrelation fingerprint.""" - def __init__(self, images, dstar=0, parameters=None): + def __init__(self, **kwargs): """Initialize. Parameters @@ -27,28 +26,24 @@ def __init__(self, images, dstar=0, parameters=None): parameters : list Parameters to use for the autocorrelation """ - if isinstance(images, Atoms): - images = [images] - - self.images = images - self.dstar = dstar + # Slab periodic table parameters. + if not hasattr(self, 'dstar'): + self.dstar = kwargs.get('dstar') - if parameters is None: - self.parameters = default_parameters + if self.dstar is None: + self.dstar = 2 - def generate(self): - """Return an (n, m) array of fingerprints.""" - fp_length = len(self.parameters) * (self.dstar + 1) - fingerprints = np.zeros((len(self.images), fp_length)) + if not hasattr(self, 'parameters'): + self.parameters = kwargs.get('parameters') - for i, atoms in enumerate(self.images): - fingerprints[i] = self.get_autocorrelation(atoms) + if self.parameters is None: + self.parameters = default_parameters - return fingerprints + super(AutoCorrelationFingerprintGenerator, self).__init__(**kwargs) def get_autocorrelation(self, atoms): """Return the autocorrelation fingerprint for a molecule.""" - connectivity = catlearn_neighborlist(atoms) + connectivity = atoms.connectivity G = nx.Graph(connectivity) distance_matrix = nx.floyd_warshall_numpy(G) diff --git a/catlearn/fingerprint/standard.py b/catlearn/fingerprint/standard.py index 25495ec9..d07fc6e4 100644 --- a/catlearn/fingerprint/standard.py +++ b/catlearn/fingerprint/standard.py @@ -24,7 +24,9 @@ 'eigenspectrum_vec', 'composition_vec', 'distance_vec', - 'bag_connections' + 'bag_elements' + 'bag_edges', + 'bag_element_cn' ] @@ -61,6 +63,17 @@ def __init__(self, **kwargs): '/catlearn/data/proxy-mendeleev.json') as f: self.element_data = json.load(f) + # Coordination number bounds. + if not hasattr(self, 'cn_max'): + self.cn_max = kwargs.get('cn_max') + if self.cn_max is None: + self.cn_max = 4 + + if not hasattr(self, 'cn_min'): + self.cn_min = kwargs.get('cn_min') + if self.cn_min is None: + self.cn_min = 0 + super(StandardFingerprintGenerator, self).__init__(**kwargs) def composition_vec(self, data): @@ -276,7 +289,32 @@ def distance_vec(self, data): features.append(co) return features - def bag_connections(self, atoms): + def bag_elements(self, atoms): + """Returns the bag of elements, defined as counting occurence of + elements in a given structure. + This is mostly useful for subtracting atomization energies. + + Parameters + ---------- + atoms : object + + Returns + ---------- + features : list + """ + # range of element types. + labels = ['bag_' + chemical_symbols[z] for z in self.atom_types] + if atoms is None: + return labels + else: + # empty bag atoms. + bag = np.zeros(len(labels)) + for i, z in enumerate(self.atom_types): + bag[i] += list(atoms.numbers).count(z) + + return list(bag) + + def bag_edges(self, atoms): """Returns the bag of connections, defined as counting connections between types of elements pairs. We define the bag as a vector, e.g. return [Number of C-H connections, # C-C, # C-O, ..., # M-X] @@ -291,11 +329,11 @@ def bag_connections(self, atoms): """ # range of element types n_elements = len(self.atom_types) - symbols = np.array([chemical_symbols[z] for z in self.atom_types]) - rows, cols = np.meshgrid(symbols, symbols) - pairs = np.core.defchararray.add(rows, cols) - labels = ['bag_' + c for c in pairs[np.triu_indices_from(pairs)]] if atoms is None: + symbols = np.array([chemical_symbols[z] for z in self.atom_types]) + rows, cols = np.meshgrid(symbols, symbols) + pairs = np.core.defchararray.add(rows, cols) + labels = ['bag_' + c for c in pairs[np.triu_indices_from(pairs)]] return labels else: # empty bag of bond types. @@ -315,4 +353,95 @@ def bag_connections(self, atoms): self.atom_types.index(bond_index[1]))) # Count bonds in upper triangle. boc[bond_type] += 1 - return list(boc[np.triu_indices_from(boc)]) + return boc[np.triu_indices_from(boc)].tolist() + + def bag_element_cn(self, atoms): + """Bag elements folded with coordination numbers, + e.g. number of C with CN = 4, number of C with CN = 3, ect. + + Parameters + ---------- + atoms : object + ASE Atoms object. + + Returns + ---------- + features : list + If None was passed, the elements are strings, naming the feature. + """ + labels = [] + atom_symbols = [chemical_symbols[z] for z in self.atom_types] + index_symbols = {} + for j, s in enumerate(atom_symbols): + index_symbols[s] = j + labels += ['bag_cn_' + s + '_' + str(n) for + n in range(self.cn_min, self.cn_max+1)] + if atoms is None: + return labels + else: + s_cn_matrix = np.zeros([len(self.atom_types), + self.cn_max+1-self.cn_min]) + cm = np.array(atoms.connectivity, dtype=int) + for i, atom in enumerate(atoms): + cn = cm[i, :].sum() + cn_i = cn - self.cn_min + if cn > self.cn_max or cn < self.cn_min: + print(atoms.info['key_value_pairs'], cn) + warnings.warn('Coordination number out of bounds') + return [np.nan] * len(labels) + s_cn_matrix[index_symbols[atoms.symbols[i]], cn_i] += 1 + fingerprint = list(np.ravel(s_cn_matrix)) + return fingerprint + + def bag_edges_cn(self, atoms): + """Returns the bag of connections folded with coordination numbers of + the node atoms. + + Parameters + ---------- + atoms : object + + Returns + ---------- + features : list + """ + # range of element types + atom_symbols = [chemical_symbols[z] for z in self.atom_types] + nodes = [] + for j, s in enumerate(atom_symbols): + nodes += [s + str(n) for n in + range(self.cn_min, self.cn_max+1)] + if atoms is None: + rows, cols = np.meshgrid(nodes, nodes) + pairs = np.core.defchararray.add(rows, cols) + labels = ['bag_' + c for c in pairs[np.triu_indices_from(pairs)]] + return labels + else: + # empty bag of bond types. + n_elements_cn = len(self.atom_types) * \ + (self.cn_max - self.cn_min + 1) + boc = np.zeros([n_elements_cn, n_elements_cn], dtype=int) + + natoms = len(atoms) + cm = np.array(atoms.connectivity, dtype=int) + np.fill_diagonal(cm, 0) + cn_list = cm.sum(axis=1) + + bonds = np.where(np.ravel(np.triu(cm)) > 0)[0] + for b in bonds: + # Get bonded atomic indices. + i_row, i_col = np.unravel_index(b, [natoms, natoms]) + z = (atoms.numbers[i_row], atoms.numbers[i_col]) + cn = (cn_list[i_row], cn_list[i_col]) + bond_index = np.lexsort((cn, z)) + node_a = chemical_symbols[np.array(z)[bond_index[0]]] + \ + str(np.array(cn)[bond_index[0]]) + node_b = chemical_symbols[np.array(z)[bond_index[1]]] + \ + str(np.array(cn)[bond_index[1]]) + + # Get bond types. + bond_type = tuple((nodes.index(node_a), + nodes.index(node_b))) + # Count bonds in upper triangle. + boc[bond_type] += 1 + return boc[np.triu_indices_from(boc)].tolist() diff --git a/catlearn/utilities/neighborlist.py b/catlearn/utilities/neighborlist.py index dd2cccc9..ebcd40b5 100644 --- a/catlearn/utilities/neighborlist.py +++ b/catlearn/utilities/neighborlist.py @@ -164,22 +164,9 @@ def ase_connectivity(atoms, cutoffs=None, count_bonds=True): if hasattr(atoms, 'neighborlist'): nl = atoms.neighborlist else: - nl = ase_neighborlist(atoms, cutoffs=cutoffs) - - conn_mat = [] - index = range(len(atoms)) - # Create binary matrix denoting connections. - for index1 in index: - conn_x = [] - for index2 in index: - if index2 in nl[index1]: - if count_bonds: - bonds = nl[index1].count(index2) - else: - bonds = 1 - conn_x.append(bonds) - else: - conn_x.append(0.) - conn_mat.append(conn_x) + nl = NeighborList(cutoffs=cutoffs, bothways=True) + nl.update(atoms) + conn_mat = nl.get_connectivity_matrix(sparse=False) + np.fill_diagonal(conn_mat, 0) return np.asarray(conn_mat, dtype=int) diff --git a/docs/changelog.md b/docs/changelog.md index 0f3ca3de..9f68eb3f 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,9 +1,12 @@ # Changelog +# Version 0.6.0 (January 2019) + +- Added ML-MIN algorithm for energy minimization. - Added ML-NEB algorithm for transition state search. - Changed input format for kernels in the GP. -# Version 0.5.0 (August 2018) +# Version 0.5.0 (October 2018) - Restructure of fingerprint module - Pandas DataFrame getter in FeatureGenerator diff --git a/setup.py b/setup.py index c45483dc..f2c33957 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def parse_requirements(filename): setuptools.setup( name="CatLearn", - version="0.6.0.dev3", + version="0.6.0", url="https://github.com/SUNCAT-Center/CatLearn", author="Paul C. Jennings", diff --git a/test/test_ads_fp_gen.py b/test/test_ads_fp_gen.py index f154bf79..13f3ac92 100644 --- a/test/test_ads_fp_gen.py +++ b/test/test_ads_fp_gen.py @@ -55,6 +55,7 @@ def test_tags(self): gen = FeatureGenerator(nprocs=1) train_fpv = default_fingerprinters(gen, 'adsorbates') train_fpv += [gen.formal_charges, + gen.bag_edges_ads, gen.ads_av, gen.ads_sum] matrix = gen.return_vec(images, train_fpv) diff --git a/test/test_autocorrelation.py b/test/test_autocorrelation.py index 6fb4f11a..0a9cdc64 100644 --- a/test/test_autocorrelation.py +++ b/test/test_autocorrelation.py @@ -1,11 +1,13 @@ -from catlearn.fingerprint.molecule import \ - AutoCorrelationFingerprintGenerator as ACG +from catlearn.featurize.setup import FeatureGenerator +from catlearn.utilities.neighborlist import ase_connectivity from ase.build import molecule +from ase.data import covalent_radii import numpy as np import unittest -truth = np.array([[166.0, 15963.0, 39.849700000000006, - 232.0, 32416.0, 72.664, 178.0, 22910.0, 78.7552]]) + +truth = np.array([[166, 15963, 39.8497, 220, 27890, + 61.444, 172, 21422, 65.1592]]) class TestAutoCorrelation(unittest.TestCase): @@ -14,8 +16,12 @@ class TestAutoCorrelation(unittest.TestCase): def test_generator(self): """Test the feature generation.""" atoms = molecule('HCOOH') - gen = ACG(atoms, dstar=2) - features = gen.generate() + atoms.center(vacuum=5) + radii = [covalent_radii[z] for z in atoms.numbers] + atoms.connectivity = ase_connectivity(atoms, radii) + images = [atoms] + gen = FeatureGenerator() + features = gen.return_vec(images, [gen.get_autocorrelation]) np.testing.assert_allclose(features, truth) diff --git a/test/test_bulk_fp_gen.py b/test/test_bulk_fp_gen.py index f6a6ea2b..f39032ea 100644 --- a/test/test_bulk_fp_gen.py +++ b/test/test_bulk_fp_gen.py @@ -36,7 +36,7 @@ def test_bulk_fp_gen(self): images = images_connectivity(images) gen = FeatureGenerator() - train_fpv = default_fingerprinters(gen, 'bulk') + [gen.bag_connections] + train_fpv = default_fingerprinters(gen, 'bulk') + [gen.bag_edges] matrix = gen.return_vec(images, train_fpv) labels = gen.return_names(train_fpv) print(np.shape(matrix), print(type(matrix)))