# -*- coding: utf-8 -*-
"""
Created on Tue Dec 19 11:05:24 2017
@author: lansf
"""
from __future__ import absolute_import, division, print_function
import os
import numpy as np
import json
import pkg_resources
from sklearn.cluster import KMeans
from imblearn.over_sampling import RandomOverSampler
def get_default_data_paths(adsorbate):
""" Get default paths to primary data of frequencies and intensities.
Parameters
----------
adsorbate : str
Name of the adsorbate for which to get the default primary DFT data.
Data is already provided for 'CO', 'NO', and 'C2H4'.
Returns
-------
nanoparticle_path : str
File path where nanoparticle or single adsorbate data will be saved.
isotope_path : str
File path where isotope data for CO and NO will be saved.
high_coverage_path : str
File path where high coverage data for CO will be saved.
coverage_scaling_path : str
File path where coverage scaling coefficients will be saved.
Notes
-----
    Returns the default file locations of the primary DFT data
    (frequencies and intensities) and the coverage-scaling parameters
    used to generate the pure and mixture spectroscopic data.
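    Examples
    --------
    A minimal usage sketch; the exact locations depend on where the package
    data is installed:

    >>> nano, iso, high_cov, cov_scale = get_default_data_paths('CO')  # doctest: +SKIP
    >>> nano.endswith('single_CO.json')  # doctest: +SKIP
    True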
"""
data_path = pkg_resources.resource_filename(__name__, 'data/')
nanoparticle_path = os.path.join(data_path, 'dft_nanoparticle/single_'+adsorbate+'.json')
isotope_path = os.path.join(data_path, 'dft_surface/isotope_'+adsorbate+'.json')
high_coverage_path = os.path.join(data_path, 'dft_surface/high_coverage_'+adsorbate+'.json')
coverage_scaling_path = os.path.join(data_path,'coverage_scaling_params_'+adsorbate+'.json')
    return (nanoparticle_path, isotope_path, high_coverage_path,
            coverage_scaling_path)
def get_exp_data_path():
""" Get default paths to experimental data.
Returns
-------
experimental_data : list
list of paths to experimental data
"""
data_path = pkg_resources.resource_filename(__name__, 'data/')
experimental_path = os.path.join(data_path, 'experimental')
experimental_data = [os.path.join(experimental_path,file) for file \
in os.listdir(experimental_path)]
return experimental_data
class IR_GEN:
"""Class for generating complex synthetic IR spectra"""
    def __init__(self, ADSORBATE='CO', INCLUDED_BINDING_TYPES=[1, 2, 3, 4], TARGET='binding_type',
                 NUM_TARGETS=None, nanoparticle_path=None, high_coverage_path=None,
                 coverage_scaling_path=None, VERBOSE=False):
"""
Parameters
----------
ADSORBATE : str
            Adsorbate for which the spectra are to be generated.
        INCLUDED_BINDING_TYPES : list
            Binding types whose frequencies/intensities from the primary data
            set will be included in generating the complex spectra.
        TARGET : str
            Geometric descriptor for which the target histogram is to be generated.
            Can be 'binding_type', 'GCN', or 'combine_hollow_sites'. If it is
            'combine_hollow_sites' then 3-fold and 4-fold sites are grouped together.
NUM_TARGETS : int
Number of GCN groups to predict. Only used if TARGET='GCN' and GCN
must be discretized.
nanoparticle_path : str
File path where nanoparticle or single adsorbate json data is saved.
high_coverage_path : str
            File path where high coverage data for CO is saved.
coverage_scaling_path : str
File path where coverage scaling coefficients are saved.
VERBOSE : bool
Controls the printing of status statements.
Attributes
----------
        BINDING_TYPES_with_4fold : numpy.ndarray
            Binding types of the filtered primary data, before any merging
            of hollow sites.
        TARGET : str
            Set during initialization.
        NUM_TARGETS : int
            Set during initialization.
        X0cov : numpy.ndarray
            Set of frequencies and intensities at low coverage.
        BINDING_TYPES : numpy.ndarray
            Binding types to be predicted; accounts for both filtering and
            merging of certain sites.
        GCNList : numpy.ndarray
            GCN values for the data points.
        NANO_PATH : str
            Set during initialization.
        HIGH_COV_PATH : str
            Set during initialization.
        COV_SCALE_PATH : str
            Set during initialization.
        ADSORBATE : str
            Set during initialization.
        INCLUDED_BINDING_TYPES : list
            Set during initialization.
        VERBOSE : bool
            Set during initialization.
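        Examples
        --------
        Typical instantiation, assuming the packaged CO data is available:

        >>> ir_gen = IR_GEN(ADSORBATE='CO', TARGET='binding_type')  # doctest: +SKIP
        >>> ir_gen.NUM_TARGETS  # doctest: +SKIP
        4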
"""
assert TARGET in ['binding_type','GCN','combine_hollow_sites'], "incorrect TARGET given"
assert type(INCLUDED_BINDING_TYPES) in [list,tuple,np.ndarray], "Included Binding Types should be a list"
#number of target variables.
        nano_path, isotope_path, high_cov_path, cov_scale_path = get_default_data_paths(ADSORBATE)
if nanoparticle_path is None:
nanoparticle_path = nano_path
if ADSORBATE in ['NO','CO']:
max_coordination=4
if coverage_scaling_path is None:
coverage_scaling_path = cov_scale_path
if ADSORBATE == 'CO':
if high_coverage_path is None:
high_coverage_path = high_cov_path
elif ADSORBATE in ['C2H4']:
max_coordination = 2
with open(nanoparticle_path, 'r') as infile:
nanoparticle_data = json.load(infile)
Xfreq_ALL = np.array(nanoparticle_data['FREQUENCIES'], dtype='float')
is_local_minima = np.min(Xfreq_ALL, axis=1) > 0
BINDING_TYPES_unfiltered = np.array(nanoparticle_data['CN_ADSORBATE'])
max_forces = np.array(nanoparticle_data['MAX_FORCE'])
is_adsorbed = BINDING_TYPES_unfiltered >0
if TARGET in ['binding_type','combine_hollow_sites']:
NUM_TARGETS = len(INCLUDED_BINDING_TYPES)
correct_coordination = BINDING_TYPES_unfiltered <= max_coordination
small_force = max_forces < 0.05
select_files = np.all((is_local_minima,is_adsorbed,small_force,correct_coordination),axis=0)
for key in nanoparticle_data.keys():
nanoparticle_data[key] = np.array(nanoparticle_data[key])[select_files]
nanoparticle_data['INTENSITIES'][nanoparticle_data['FREQUENCIES'] == 0] = 0
BINDING_TYPES_with_4fold = nanoparticle_data['CN_ADSORBATE']
if TARGET == 'combine_hollow_sites':
nanoparticle_data['CN_ADSORBATE'][nanoparticle_data['CN_ADSORBATE'] == 4] = 3
NUM_TARGETS -= 1
if VERBOSE == True:
print('grouping hollow sites')
self.BINDING_TYPES_with_4fold = BINDING_TYPES_with_4fold
self.TARGET = TARGET
self.NUM_TARGETS = NUM_TARGETS
self.X0cov = np.array([(nanoparticle_data['FREQUENCIES'][i], nanoparticle_data['INTENSITIES'][i])
for i in range(len(nanoparticle_data['FREQUENCIES']))])
self.BINDING_TYPES = nanoparticle_data['CN_ADSORBATE']
self.GCNList = nanoparticle_data['GCN']
self.NANO_PATH = nanoparticle_path
self.HIGH_COV_PATH = high_coverage_path
self.COV_SCALE_PATH = coverage_scaling_path
self.ADSORBATE = ADSORBATE
self.INCLUDED_BINDING_TYPES = INCLUDED_BINDING_TYPES
self.VERBOSE = VERBOSE
    def set_GCNlabels(self, Minimum=0, BINDING_TYPE_FOR_GCN=[1], showfigures=False, figure_directory='show'):
""" Cluster GCN values into groups/classes using k-means clustering.
Parameters
----------
Minimum : int
Minimum number of datapoints in each cluster. If a generated cluster
has fewer than this number of datapoints it is merged with the next
cluster. If the last cluster has fewer than the minimum number of
datapoints it is merged with the previous cluster.
showfigures : bool
Whether or not to generate figures visualizing the clusters and
their location in GCN-space.
figure_directory : str
Either a directory where figures are to be saved or the string 'show'
which indicates that the figure is supposed to be sent to gui output.
BINDING_TYPE_FOR_GCN : list
List of binding types whose GCN values are to be included in clustering.
Binding-types included will have a GCN label of 1 through the number
of clusters. Binding-types not included will be assigned a GCN label
of zero.
Attributes
----------
GCNlabels : numpy.ndarray
GCN label assigned to each primary datapoint.
        NUM_TARGETS : int
            Updated number of targets. If n of the clusters generated by the
            k-means algorithm had fewer than the minimum number of datapoints
            and were merged, the NUM_TARGETS originally instantiated with the
            class is reduced by n.
Notes
-----
Assigns each primary datapoint a GCN label based on the GCN value using
k-means clustering where the number of target clusters is equal to the
number of targets instantiated with the class. This is required to be
run if one wishes to learn a distribution of GCN sites as GCN is
        continuous. K-means clustering is an unsupervised learning
        technique that generates clusters/groups that are relatively evenly
        spaced with roughly the same number of datapoints in each cluster.
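        Examples
        --------
        A self-contained sketch of the clustering idea on synthetic GCN
        values (not the class's actual data):

        >>> import numpy as np
        >>> from sklearn.cluster import KMeans
        >>> gcn = np.array([2.1, 2.3, 4.0, 4.2, 7.5, 7.6])
        >>> labels = KMeans(n_clusters=3, random_state=0).fit_predict(gcn.reshape(-1, 1))
        >>> len(set(labels))
        3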
"""
VERBOSE = self.VERBOSE
if VERBOSE == True:
print('Initial number of targets: '+str(self.NUM_TARGETS))
ADSORBATE = self.ADSORBATE
NUM_TARGETS = self.NUM_TARGETS
GCNList = self.GCNList
BINDING_TYPES = self.BINDING_TYPES
GCNList_selected = GCNList[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)]
KC = KMeans(n_clusters=NUM_TARGETS, random_state=0).fit(GCNList_selected.reshape(-1, 1))
KC_new = np.zeros(NUM_TARGETS, dtype='int')
KC_new[0] = KC.labels_[np.argmin(GCNList_selected)]
for i in range(NUM_TARGETS-1):
KC_new[i+1] = KC.labels_[np.isin(KC.labels_, KC_new[0:i+1]) == False]\
[np.argmin(GCNList_selected[np.isin(KC.labels_, KC_new[0:i+1]) == False])]
KC2class = dict(zip(KC_new, np.arange(1, len(KC_new)+1, dtype='int')))
KCclass = np.array([KC2class[i] for i in KC.labels_])
for i in range(NUM_TARGETS-1)[::-1]:
NUM_IN_CLASS = len(KCclass[KCclass == i+2])
if NUM_IN_CLASS < Minimum:
for ii in range(NUM_TARGETS)[::-1]:
if KC2class[ii] == i+2:
KC2class.update({ii:i+1})
KCclass = np.array([KC2class[ii] for ii in KC.labels_])
NUM_IN_CLASS = len(KCclass[KCclass == 1])
if NUM_IN_CLASS < Minimum:
for i in range(NUM_TARGETS):
if KC2class[i] > 1:
KC2class.update({i:KC2class[i]-1})
NUM_IN_CLASS = len(KCclass[KCclass == 2])
if NUM_IN_CLASS < Minimum:
for i in range(NUM_TARGETS):
if KC2class[i] > 2:
KC2class.update({i:KC2class[i]-1})
KCclass = np.array([KC2class[i] for i in KC.labels_])
NUM_TARGETS = len(set(KCclass))
GCNlabels = np.zeros(len(GCNList), dtype='int')
GCNlabels[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)] = KCclass
BreakPoints = np.linspace(0, 8.5, num=810)
BreakLabels = KC.predict(BreakPoints.reshape(-1, 1))
        #convert to an array so the boolean comparison below is elementwise
        BreakLabels = np.array([KC2class[i] for i in BreakLabels])
BreakMin = np.array([np.min(BreakPoints[BreakLabels == i])
for i in np.arange(1, NUM_TARGETS+1)])
BreakMax = np.array([np.max(BreakPoints[BreakLabels == i])
for i in np.arange(1, NUM_TARGETS+1)])
if showfigures == True:
import matplotlib.pyplot as plt
plt.figure()
plt.scatter(GCNList_selected, GCNList_selected, c=KC.labels_)
for i in BreakMax:
plt.plot([0, i, 2*i], [2*i, i, 0], 'k-')
plt.xlabel('GCN')
plt.ylabel('GCN')
plt.xlim([0, 8.5])
plt.ylim([0, 8.5])
BreakString = zip(np.around(BreakMin, decimals=1), np.around(BreakMax, decimals=1))
BreakString = [str(count+1)+': '+str(i[0])+'-'+str(i[1])
for count, i in enumerate(BreakString)]
plt.figure()
#ax = plt.subplot()
plt.hist(GCNlabels[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)], bins=np.arange(0.5, NUM_TARGETS+1.5), rwidth=0.5)
plt.xticks(range(1, NUM_TARGETS+1))
#ax.set_xticklabels(greater_than)
plt.xlabel('GCN Group')
plt.ylabel('DFT Samples')
plt.show()
print(BreakString)
from matplotlib import rcParams
rcParams['lines.linewidth'] = 2
rcParams['lines.markersize'] = 5
params = {'figure.autolayout': True}
rcParams.update(params)
if figure_directory == 'show':
plt.figure()
else:
plt.figure(0, figsize=(3.5, 3), dpi=400)
#ax = plt.subplot()
plt.hist(GCNlabels[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)], bins=np.arange(0.5, NUM_TARGETS+1.5), rwidth=0.5)
plt.xticks(range(1, NUM_TARGETS+1))
#ax.set_xticklabels(greater_than)
plt.xlabel('GCN Group')
plt.ylabel('DFT Samples')
if figure_directory == 'show':
plt.show()
else:
plt.savefig(os.path.join(figure_directory,ADSORBATE+'GCN_Clustering.jpg'), format='jpg')
plt.close()
self.GCNlabels = GCNlabels
self.NUM_TARGETS = NUM_TARGETS
if VERBOSE == True:
print('Final number of targets: '+str(self.NUM_TARGETS))
    def _get_probabilities(self, num_samples, NUM_TARGETS):
""" Get probablilities for sampling data from different classes/groups.
Parameters
----------
num_samples : int
number of complex spectra to generate
NUM_TARGETS : int
            number of classes/labels desired. Usually the number of
            binding-types or the number of GCN labels > 0.
Returns
-------
probabilities : numpy.ndarray
The probabilities to select each binding-type or GCN group.
Has dimensions (num_samples, NUM_TARGETS).
Notes
-----
        Returns a 2-D numpy.ndarray of length num_samples along the first
        dimension and NUM_TARGETS along the second dimension. Elements
        correspond to the probability of selecting a primary data point from
        a specific class/group such that the array sums to 1 along the 2nd
        dimension. The probability assigned to the first index along the 2nd
        dimension is drawn from a uniform distribution between 0 and 1, while
        the probability assigned to each following index $n$ is drawn
        uniformly between 0 and the remaining probability mass
        $1-\sum_{i<n}{p_{i}}$. The probabilities are then shuffled along the
        second dimension. With this distribution, the contribution to a
        spectrum from any given class/label is most likely zero, and the
        likelihood of a contribution monotonically decreases as the
        contribution goes to 1. This ensures that all fractional
        contributions are as equally sampled as possible.
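        Examples
        --------
        A minimal sketch of the stick-breaking scheme for a single sample
        with three targets:

        >>> import numpy as np
        >>> np.random.seed(0)
        >>> p = np.zeros(3)
        >>> remaining = 1.0
        >>> for i in range(2):
        ...     p[i] = remaining * np.random.random_sample()
        ...     remaining -= p[i]
        >>> p[2] = 1.0 - p[0] - p[1]
        >>> bool(abs(p.sum() - 1.0) < 1e-12)
        True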
"""
        #build the probability columns by stick-breaking, then shuffle each row
probabilities = np.zeros((num_samples, NUM_TARGETS))
A = np.zeros(num_samples)
for i in range(NUM_TARGETS-1):
probs = (1-A)*np.random.random_sample(size=num_samples)
A = A+probs
probabilities[:, i] = probs
probabilities[:, -1] = 1 - np.sum(probabilities, axis=1)
#set any slightly negative probabilities to 0
probabilities[probabilities[...] < 2**-500] = 0
[np.random.shuffle(i) for i in probabilities]
return probabilities
    def _perturb_spectra(self, perturbations, X, y, a=0.999, b=1.001, BINDING_TYPES=None):
""" Generate pertubations of the frequencies and intensities uniformly.
Parameters
----------
perturbations : int
The number of perturbed primary datapoints per original datapoint.
X : numpy.ndarray
3-D numpy array of frequencies and intensities. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
y : numpy.ndarray
The array of target variables to be regressed (binding types or GCN-labels)
        a : float
            The lower bound of the uniform distribution.
        b : float
            The upper bound of the uniform distribution.
BINDING_TYPES : numpy.ndarray
Binding types of the primary datapoints.
Returns
-------
Xperturbed : numpy.ndarray
Perturbed intensities and frequencies. Has dimensions
$m_{new} x n x p$ where $m_{new} = m x perturbations$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
        yperturbed : numpy.ndarray
            Some y-variable to be regressed such as binding-type or GCN group.
BINDING_TYPES_perturbed : numpy.ndarray
Binding-types of the perturbed primary datapoints.
Notes
-----
Returns a tuple of perturbed primary datapoints. The goal is to improve the
predictive range of the regressed model by expanding the datapoints and
        accounting for error in the primary data (DFT). Spectra are perturbed
        according to a uniform distribution between $a$ and $b$.
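        Examples
        --------
        The repeat-and-multiply pattern on a toy array, with uniform draws
        in [a, b):

        >>> import numpy as np
        >>> X = np.ones((2, 2, 3))
        >>> a, b, perturbations = 0.999, 1.001, 5
        >>> noise = (b - a) * np.random.random_sample((2 * perturbations, 2, 3)) + a
        >>> Xp = X.repeat(perturbations, axis=0) * noise
        >>> Xp.shape
        (10, 2, 3)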
"""
X = X.copy(); y = y.copy()
perturbed_values = (b-a)*np.random.random_sample((X.shape[0]*perturbations
, X.shape[1], X.shape[2]))+a
Xperturbed = X.repeat(perturbations, axis=0)*perturbed_values
yperturbed = y.repeat(perturbations, axis=0)
if BINDING_TYPES is not None:
BINDING_TYPES = BINDING_TYPES.copy()
BINDING_TYPES_perturbed = BINDING_TYPES.repeat(perturbations,axis=0)
else:
BINDING_TYPES_perturbed = None
return (Xperturbed, yperturbed, BINDING_TYPES_perturbed)
    def _mixed_lineshape(self, FWHM, fL, ENERGY_POINTS, energy_spacing):
""" Convolute spectra with Lorentzian or Gaussian induce broadening.
Parameters
----------
FWHM : float
Full-width-half-maximum of the desired spectra
fL : float
            Fraction of the spectrum to be convoluted by a Lorentzian transform.
            The remaining portion of the transform that makes up the FWHM is
            sourced from a Gaussian convoluting function.
ENERGY_POINTS : int
Number of energy points in the desired spectra
energy_spacing : float
spacing of the energy points
Returns
-------
transform : numpy.ndarray
transform that is convolved with a spectra that has a narrow line
width in order to produce spectra with greater line widths.
Notes
-----
        Accepts a full-width-half-maximum, a fraction of Lorentzian, energy
        spacing, and number of points, and produces the transform with which
        the spectrum is convolved through Fourier convolution in order to
        produce realistic experimental spectra.
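        Examples
        --------
        A self-contained pseudo-Voigt sketch of the mixing idea (without the
        normalizing prefactors this method applies):

        >>> import numpy as np
        >>> x = np.linspace(-50, 50, 101)
        >>> FWHM, fL = 20.0, 0.3
        >>> lorentz = 1.0 / (1.0 + 4.0 * (x / FWHM) ** 2)
        >>> gauss = np.exp(-4.0 * np.log(2.0) * (x / FWHM) ** 2)
        >>> voigt = fL * lorentz + (1 - fL) * gauss
        >>> round(float(voigt[50]), 10)  # both components peak at 1 at x = 0
        1.0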
"""
numpoints = 2*ENERGY_POINTS-1
        #x is the x-axis of the transform whose values are spaced with energy_spacing,
        #and centered at zero such that the total points = 2 * ENERGY_POINTS - 1
x = energy_spacing*(np.arange(numpoints, dtype='int')-int((numpoints-1)/2))
b = 0.5/np.sqrt(np.log(2))
L_prefactor = 4*2/(FWHM*np.pi)
G_prefactor = 4*1/(b*FWHM*np.sqrt(np.pi))
specL = 1.0/(1.0+4.0*(x/FWHM)**2)
specG = np.exp(-(x/(b*FWHM))**2)
transform = fL*L_prefactor*specL+(1-fL)*G_prefactor*specG
return transform
    def _coverage_shift(self, X, BINDING_TYPES, SELF_COVERAGE, TOTAL_COVERAGE):
""" Shift frqequencies and intensities to account for coverages effects.
Parameters
----------
X : numpy.ndarray
3-D numpy array of frequencies and intensities. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
BINDING_TYPES : numpy.ndarray
Binding types of the primary datapoints.
SELF_COVERAGE : numpy.ndarray of floats
Relative spatial coverage of each binding-type
        TOTAL_COVERAGE : numpy.ndarray of floats
            Relative combined coverage in which the primary data point "sits"
Returns
-------
Xcov : numpy.ndarray
3-D numpy array of frequencies and intensities that have been shifted
to account for coverage effects. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
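        Examples
        --------
        The linear frequency-shift model, with hypothetical coefficients
        standing in for the SELF_CO_PER_A2 and CO_PER_A2 values read from the
        coverage-scaling JSON:

        >>> self_cov_abs, total_cov_abs = 0.5 * 0.096, 0.8 * 0.096
        >>> a_self, a_total = 0.1, 0.05  # hypothetical coefficients
        >>> round(a_self * self_cov_abs + a_total * total_cov_abs + 1, 4)
        1.0086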
"""
coverage_scaling_path = self.COV_SCALE_PATH
ones = np.ones(X.shape[0])
Xfrequencies = X[:, 0].copy()
Xintensities = X[:, 1].copy()
if type(SELF_COVERAGE) == int or type(SELF_COVERAGE) == float:
SELF_COVERAGE = ones*SELF_COVERAGE
if type(TOTAL_COVERAGE) == int or type(TOTAL_COVERAGE) == float:
TOTAL_COVERAGE = ones*TOTAL_COVERAGE
with open(coverage_scaling_path, 'r') as infile:
Coverage_Scaling = json.load(infile)
#maximum coverage obtained from Norton et al. 1982
ABS_COVERAGE = SELF_COVERAGE*0.096
TOTAL_COVERAGE_ABS = TOTAL_COVERAGE*0.096
#site-type: atop, bridge,threefold,fourfold
CO_STRETCH_IDX = np.argmax(Xfrequencies, axis=1)
#Copy frequencies and update CO stretch before updating all, then set CO stretch
CO_frequencies = Xfrequencies.copy()[np.arange(len(Xfrequencies)), CO_STRETCH_IDX]
CO_intensities = Xintensities.copy()[np.arange(len(Xintensities)), CO_STRETCH_IDX]
for i in range(4):
CO_frequencies[BINDING_TYPES == i+1] =\
(CO_frequencies[BINDING_TYPES == i+1]\
* (Coverage_Scaling['CO_FREQ'][i]['SELF_CO_PER_A2']\
* ABS_COVERAGE[BINDING_TYPES == i+1]\
+ Coverage_Scaling['CO_FREQ'][i]['CO_PER_A2']\
* TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1] + 1))
CO_intensities[BINDING_TYPES == i+1] = (CO_intensities[BINDING_TYPES == i+1]\
*np.exp(Coverage_Scaling['CO_INT_EXP']*TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1]))
Xfrequencies[BINDING_TYPES == i+1] = (Xfrequencies[BINDING_TYPES == i+1]\
*(Coverage_Scaling['PTCO_FREQ']['SELF_CO_PER_A2']*ABS_COVERAGE[BINDING_TYPES == i+1]\
+Coverage_Scaling['PTCO_FREQ']['CO_PER_A2']*TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1]\
+1).reshape((-1, 1)))
Xintensities[BINDING_TYPES == i+1] = (Xintensities[BINDING_TYPES == i+1]\
*np.exp(Coverage_Scaling['PTCO_INT_EXP']*TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1]).reshape((-1,1)))
Xfrequencies[np.arange(X.shape[0]), CO_STRETCH_IDX] = CO_frequencies
#Xfrequencies cannot be less than 0
Xfrequencies[Xfrequencies[...]<2**-500] = 0
Xintensities[np.arange(X.shape[0]), CO_STRETCH_IDX] = CO_intensities
Xcov = np.stack((Xfrequencies,Xintensities),axis=1)
return Xcov
    def scaling_factor_shift(self, X):
""" Shift frequencies by a scaling factor to match experiment.
Parameters
----------
X : numpy.ndarray
3-D numpy array of frequencies and intensities. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
Returns
-------
X : numpy.ndarray
3-D numpy array of frequencies and intensities. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
Notes
-----
        A scaling factor shift accounts for systematic errors in the functional
        to over- or under-bind as well as systematic errors in the harmonic
approximation. Computed according to
https://cccbdb.nist.gov/vibnotes.asp
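        Examples
        --------
        The frequency-dependent scaling on toy data; the C-O stretch (above
        1000 cm^-1) is scaled up while the metal-C mode is scaled down:

        >>> import numpy as np
        >>> freqs = np.array([2100.0, 450.0])
        >>> scaled = np.where(freqs > 1000, freqs * 1.0121, freqs * 0.969)
        >>> bool(scaled[0] > freqs[0]) and bool(scaled[1] < freqs[1])
        True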
"""
Xfrequencies = X[:, 0].copy()
Xintensities = X[:, 1].copy()
CO_STRETCH_IDX = Xfrequencies > 1000
MC_IDX = Xfrequencies < 1000
        #Scaling factors determined by comparing experiment to DFT
        #standard errors are 0.00097044 and 0.00183104, respectively
SFCO = 1.0121
SFMC = 0.969
Xfrequencies[CO_STRETCH_IDX] = Xfrequencies[CO_STRETCH_IDX]*SFCO
Xfrequencies[MC_IDX] = Xfrequencies[MC_IDX]*SFMC
X = np.stack((Xfrequencies,Xintensities),axis=1)
return X
    def _perturb_and_shift(self, perturbations, X, y, BINDING_TYPES=None):
""" Shift spectra with scaling factor that has gaussian error.
Parameters
----------
perturbations : int
The number of perturbed primary datapoints per original datapoint.
X : numpy.ndarray
3-D numpy array of frequencies and intensities. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities.
y : numpy.ndarray
The array of target variables to be regressed (binding types or GCN-labels)
BINDING_TYPES : numpy.ndarray
Binding types of the primary datapoints.
Returns
-------
Xperturbed : numpy.ndarray
Perturbed intensities and frequencies. Has dimensions
$m_{new} x n x p$ where $m_{new} = m x perturbations$ is the number of primary datapoints, $n = 2$ and
$p$ is the number of frequencies/intensities for each datapoint.
        yperturbed : numpy.ndarray
            Some y-variable to be regressed such as binding-type or GCN group.
BINDING_TYPES_perturbed : numpy.ndarray
Binding-types of the perturbed primary datapoints.
Notes
-----
        Returns a tuple of perturbed primary datapoints. The goal is to
        incorporate error in the scaling factor into the validation and test
        sets. The scaling factor is perturbed according to its calculated
        standard error.
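        Examples
        --------
        Gaussian perturbation of the CO scaling factor on toy frequencies:

        >>> import numpy as np
        >>> np.random.seed(0)
        >>> freqs = np.full(1000, 2100.0)
        >>> sf = 0.00097044 * np.random.standard_normal(1000) + 1.0121
        >>> bool(abs((freqs * sf).mean() / 2100.0 - 1.0121) < 0.001)
        True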
"""
ADSORBATE = self.ADSORBATE
X = X.copy()
y = y.copy()
Xperturbed = X.repeat(perturbations, axis=0)
yperturbed = y.repeat(perturbations, axis=0)
if BINDING_TYPES is not None:
BINDING_TYPES = BINDING_TYPES.copy()
BINDING_TYPES_perturbed = BINDING_TYPES.repeat(perturbations,axis=0)
else:
BINDING_TYPES_perturbed = None
Xfrequencies = Xperturbed[:, 0].copy()
Xintensities = Xperturbed[:, 1].copy()
CO_STRETCH_IDX = Xfrequencies > 1000
MC_IDX = Xfrequencies < 1000
        #Scaling factors determined by comparing experiment to DFT
        #standard errors are 0.00097044 and 0.00183104, respectively
SFCO = 1.0121
SFMC = 0.969
if ADSORBATE == 'CO':
perturb_CO_freq = 0.00097044 * np.random.standard_normal((Xfrequencies[CO_STRETCH_IDX].shape)) + SFCO
perturb_MC_freq = 0.00183104 * np.random.standard_normal((Xfrequencies[MC_IDX].shape)) + SFMC
else:
perturb_CO_freq = 0.00097044 * np.random.standard_normal((Xfrequencies[CO_STRETCH_IDX].shape)) + 1
perturb_MC_freq = 0.00183104 * np.random.standard_normal((Xfrequencies[MC_IDX].shape)) + 1
perturb_CO_int = 0.00097044 * np.random.standard_normal((Xintensities[CO_STRETCH_IDX].shape)) + 1
perturb_MC_int = 0.00183104 * np.random.standard_normal((Xintensities[MC_IDX].shape)) + 1
Xfrequencies[CO_STRETCH_IDX] *= perturb_CO_freq
Xfrequencies[MC_IDX] *= perturb_MC_freq
Xintensities[CO_STRETCH_IDX] *= perturb_CO_int
Xintensities[MC_IDX] *= perturb_MC_int
Xperturbed_and_shifted = np.stack((Xfrequencies,Xintensities),axis=1)
Xperturbed_and_shifted[Xperturbed_and_shifted[...]<2**-500] = 0
return (Xperturbed_and_shifted, yperturbed, BINDING_TYPES_perturbed)
    def _generate_spectra(self, Xfrequencies, Xintensities, ENERGIES):
""" Convert set of frequencies and intensities to spectra with minimal
line width by Gaussian convolution.
Parameters
----------
Xfrequencies : numpy.ndarray
2-D numpy array of frequencies. The ndarray has dimensions
$m x n$ where $m$ is the number of primary datapoints, $n$
is the number of frequencies/intensities for each datapoint.
Xintensities : numpy.ndarray
2-D numpy array of intensities. The ndarray has dimensions
$m x n$ where $m$ is the number of primary datapoints, $n$
is the number of frequencies/intensities for each datapoint.
ENERGIES : numpy.ndarray
            Numpy array of energies onto which frequencies and intensities will
            be convolved.
Returns
-------
        int_mesh : numpy.ndarray
            numpy ndarray with dimensions $m x p$ where $m$ is the number of
            primary datapoints and $p$ is the number of energy points onto which
            the frequencies and intensities are projected with Fourier convolution.
Notes
-----
This preconvolution is necessary for numerical reasons before data is
convoluted to induce greater spectral broadening.
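        Examples
        --------
        Projecting a single stick frequency onto an energy grid with a narrow
        Gaussian (a self-contained sketch of the meshing step):

        >>> import numpy as np
        >>> energies = np.linspace(200.0, 2200.0, 501)
        >>> spacing = energies[1] - energies[0]
        >>> sigma = 2 * spacing / (2.0 * np.sqrt(2.0 * np.log(2.0)))
        >>> mesh = np.exp(-(2100.0 - energies) ** 2 / (2.0 * sigma ** 2))
        >>> int(np.argmax(mesh))  # the peak lands at the grid point nearest 2100
        475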
"""
ENERGY_POINTS = len(ENERGIES)
energy_spacing = ENERGIES[1]-ENERGIES[0]
FWHM = 2*energy_spacing
sigma = FWHM/(2.0 * np.sqrt(2.0 * np.log(2.)))
prefactor = 1.0/(sigma * np.sqrt(2.0 * np.pi))
energies2D = ENERGIES.reshape((-1, 1))
int_mesh = np.zeros((Xfrequencies.shape[0], ENERGY_POINTS))
#mesh everything on energy grids with a FWHM of twice the energy spacing
for i in range(Xfrequencies.shape[0]):
freq2D = np.tile(Xfrequencies[i], (ENERGY_POINTS, 1))
int2D = np.tile(Xintensities[i], (ENERGY_POINTS, 1))
temp = int2D*prefactor*np.exp(-(freq2D-energies2D)**2/(2.0*sigma**2))
int_mesh[i] = np.sum(temp, axis=1)
return int_mesh
    def _xyconv(self, X_sample, Y_sample, probabilities, BINDING_TYPES_sample):
""" Covolutes balananced sample of primary data and generates complex
mixed spectra.
Parameters
----------
X_sample : numpy.ndarray
3-D numpy array of frequencies and intensities. The ndarray has dimensions
$m x n x p$ where $m$ is the number of primary datapoints after data class balancing
, $n = 2$ and $p$ is the number of frequencies/intensities.
Y_sample : numpy.ndarray
The array of target variables to be regressed (binding types or GCN-labels)
probabilities : numpy.ndarray
The probabilities to select each binding-type or GCN group.
Has dimensions (num_samples, NUM_TARGETS).
BINDING_TYPES_sample : numpy.ndarray
Binding types of the primary datapoints after data class balancing.
Returns
-------
Xconv : numpy.ndarray
Convoluted complex spectra of dimensions $m x n$ where $m$ is the
desired number of samples and $n$ is the number of energy points.
yconv : numpy.ndarray
            Fraction of occupied binding-types or GCN groups that contribute to
            the total spectra. yconv has dimensions $m x p$ where $m$ is the
            desired number of samples and $p$ is the number of targets.
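        Examples
        --------
        The core mixing step, sketched with toy data: draw simple spectra with
        class-weighted probabilities and sum them into one complex spectrum:

        >>> import numpy as np
        >>> np.random.seed(1)
        >>> int_mesh = np.random.random_sample((6, 501))  # 6 simple spectra
        >>> parray = np.full(6, 1.0 / 6.0)  # sampling probabilities
        >>> picks = np.random.choice(np.arange(6), size=50, replace=True, p=parray)
        >>> int_mesh[picks].sum(axis=0).shape
        (501,)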
"""
get_probabilities = self._get_probabilities
_mixed_lineshape = self._mixed_lineshape
_coverage_shift = self._coverage_shift
_generate_spectra = self._generate_spectra
NUM_TARGETS = self.NUM_TARGETS
ENERGY_POINTS = self.ENERGY_POINTS
COVERAGE = self.COVERAGE
TARGET = self.TARGET
INCLUDED_BINDING_TYPES = self.INCLUDED_BINDING_TYPES
MAX_COVERAGES = self.MAX_COVERAGES
ENERGIES = self.ENERGIES
if TARGET == 'GCN':
GCNlabels = self.GCNlabels
energy_spacing = ENERGIES[1]-ENERGIES[0]
Xfrequencies = X_sample[:, 0].copy()
Xintensities = X_sample[:, 1].copy()
np.random.shuffle(probabilities)
num_samples = len(probabilities)
num_simple_spectra = np.random.randint(1, high=201, size=num_samples)
fLs = np.random.sample(num_samples)
FWHMs = np.random.uniform(low=2, high=75, size=num_samples)
Xconv = np.zeros((num_samples, ENERGY_POINTS))
yconv = np.zeros((num_samples, NUM_TARGETS))
y_mesh = np.zeros((Y_sample.size, NUM_TARGETS))
        #subtracting MIN_Y ensures that the column indices of y_mesh align
        #with the labels, even if the labels don't start at 0
if TARGET == 'GCN':
MIN_Y = min(GCNlabels[GCNlabels>0])
else:
MIN_Y = min(INCLUDED_BINDING_TYPES)
y_mesh[np.arange(Y_sample.size), Y_sample-MIN_Y] = 1
sample_indices = np.arange(Y_sample.size)
parray = np.zeros(Y_sample.size)
shift_vector = np.zeros(Y_sample.size)
if TARGET == 'GCN' or COVERAGE == 'low' or type(COVERAGE) in [float,int]:
int_mesh = _generate_spectra(Xfrequencies, Xintensities,ENERGIES)
else:
coverage_parray = get_probabilities(num_samples, 11)
coverage_totals = np.random.random_sample(size=[num_samples,10])
for i in range(num_samples):
shift_vector[...]=0
non_zeros = np.random.randint(low=1, high=Y_sample.size)
indices_to_shift = np.random.choice(sample_indices,size=non_zeros,replace=False)
shift_vector[indices_to_shift] = np.random.random_sample(size=non_zeros)
for ii in range(NUM_TARGETS):
ii_MIN_Y = ii+MIN_Y
shift_vector_sum = shift_vector[Y_sample == ii_MIN_Y].sum()
if shift_vector_sum >0:
shift_vector[Y_sample == ii_MIN_Y] /= shift_vector_sum
parray[Y_sample == ii_MIN_Y] = probabilities[i, ii] * shift_vector[Y_sample==ii_MIN_Y]
else:
parray[Y_sample == ii+MIN_Y] = probabilities[i, ii]
parray /= np.sum(parray)
indices_primary = np.random.choice(sample_indices, size=num_simple_spectra[i], replace=True, p=parray)
if TARGET == 'GCN' or COVERAGE == 'low' or type(COVERAGE) in [float,int]:
combined_mesh = np.sum(int_mesh[indices_primary], axis=0)
else:
#Allow different coverages to exist on the surfaces.
#Allow the different binding types to have different coverages
#initialize coverages
SELF_COVERAGE = np.random.random_sample(num_simple_spectra[i])
TOTAL_COVERAGE = np.zeros_like(SELF_COVERAGE)
COVERAGE_INDICES = np.random.choice([0,1,2,3,4,5,6,7,8,9,10], size=num_simple_spectra[i],replace=True, p=coverage_parray[i])
                #a coverage index of 0 denotes an isolated island, so it keeps its own (single) coverage and is skipped here
for ii in sorted(set(COVERAGE_INDICES.tolist()+[0]))[1:]:
TOTAL_COVERAGE[COVERAGE_INDICES == ii] = coverage_totals[i][ii-1]
#Set coverage of each spectra to be total coverage divided by the number of spectra being combined
SELF_COVERAGE[COVERAGE_INDICES == ii ] = coverage_totals[i][ii-1]/TOTAL_COVERAGE[COVERAGE_INDICES == ii].size
                    #update self coverage of identical binding types to be the same (their sum)
for iii in INCLUDED_BINDING_TYPES:
SELF_COVERAGE[np.all((COVERAGE_INDICES == ii,BINDING_TYPES_sample[indices_primary]==iii),axis=0)] \
= np.sum(SELF_COVERAGE[np.all((COVERAGE_INDICES == ii,BINDING_TYPES_sample[indices_primary]==iii),axis=0)])
#decrease maximum coverage at less favorable sites to improve prediction score
for count, max_coverage in enumerate(MAX_COVERAGES):
SELF_COVERAGE[BINDING_TYPES_sample[indices_primary] == INCLUDED_BINDING_TYPES[count]] *= max_coverage
#Ensure that Total coverage is compatible with self_coverage
for ii in sorted(set(COVERAGE_INDICES.tolist()+[0]))[1:]:
coverage_factor = 0
for count, max_coverage in enumerate(MAX_COVERAGES):
coverage_factor += max_coverage * \
SELF_COVERAGE[np.all((COVERAGE_INDICES == ii,BINDING_TYPES_sample[indices_primary] == INCLUDED_BINDING_TYPES[count]), axis=0)].size
TOTAL_COVERAGE[COVERAGE_INDICES == ii] *= coverage_factor/TOTAL_COVERAGE[COVERAGE_INDICES == ii].size
TOTAL_COVERAGE[COVERAGE_INDICES==0] = SELF_COVERAGE[COVERAGE_INDICES==0]
Xcov = _coverage_shift(X_sample[indices_primary], BINDING_TYPES_sample[indices_primary], SELF_COVERAGE, TOTAL_COVERAGE)
Xcovfrequencies = Xcov[:, 0].copy()
Xcovintensities = Xcov[:, 1].copy()
int_mesh = _generate_spectra(Xcovfrequencies,Xcovintensities,ENERGIES)
for count, max_coverage in enumerate(MAX_COVERAGES):
int_mesh[BINDING_TYPES_sample[indices_primary] == INCLUDED_BINDING_TYPES[count]] *= max_coverage
combined_mesh = np.sum(int_mesh, axis=0)
yconv[i] = np.sum(y_mesh[indices_primary], axis=0, dtype='int')
transform = _mixed_lineshape(FWHMs[i], fLs[i], ENERGY_POINTS, energy_spacing)
Xconv[i] = np.convolve(combined_mesh, transform, mode='valid')
#This part is to adjust the tabulated contribution to match that of the adjusted spectra for max coverage of bridge, 3-fold and 4-fold
if COVERAGE == 'high':
if TARGET == 'binding_type':
for count, max_coverage in enumerate(MAX_COVERAGES):
yconv[:,count] *= max_coverage
if TARGET == 'combine_hollow_sites':
for count, max_coverage in enumerate(MAX_COVERAGES[0:len(MAX_COVERAGES)-1]):
yconv[:,count] *= max_coverage
#10**-3 accounts for noise in experimental spectra
Xconv += 10**-3*np.max(Xconv, axis=1).reshape((-1, 1))*np.random.random_sample(Xconv.shape)
#normalize so max X is 1 and make y a set of fractions that sum to 1
Xconv /= np.max(Xconv, axis=1).reshape((-1, 1))
yconv /= np.sum(yconv, axis=1).reshape((-1, 1))
return (Xconv, yconv)
    def add_noise(self, Xconv_main, Xconv_noise, noise2signalmax=0.67):
""" Adds two sets of complex spectra, multiplying one by a uniform
random variable between 0 and 0.67 which is treated as noise.
Parameters
----------
Xconv_main : numpy.ndarray
Convoluted complex spectra of dimensions $m x n$ where $m$ is the
desired number of samples and $n$ is the number of energy points.
Xconv_noise : numpy.ndarray
Convoluted complex spectra of dimensions $m x n$ where $m$ is the
desired number of samples and $n$ is the number of energy points.
This spectra is treated as noise.
Returns
-------
X_noisey : numpy.ndarray
Convoluted complex spectra of dimensions $m x n$ where $m$ is the
desired number of samples and $n$ is the number of energy points.
Data has noise from Xconv_noise.
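        Examples
        --------
        Mixing toy "main" and "noise" spectra and renormalizing, as this
        method does:

        >>> import numpy as np
        >>> np.random.seed(0)
        >>> main = np.random.random_sample((4, 10))
        >>> noise = np.random.random_sample((4, 10))
        >>> scale = np.random.uniform(0, 0.67, size=4).reshape(-1, 1)
        >>> noisy = main + noise[np.random.choice(4, size=4)] * scale
        >>> noisy /= noisy.max(axis=1).reshape(-1, 1)
        >>> bool(np.all(noisy.max(axis=1) == 1.0))
        True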
"""
X_noisey = np.zeros((len(Xconv_main), len(Xconv_main[0])))
noise_value = np.random.uniform(low=0, high=noise2signalmax, size=len(Xconv_main))
noise_indices = np.arange(len(Xconv_noise))
noise_sample = np.random.choice(noise_indices, size=len(Xconv_main), replace=True)
for i in range(len(Xconv_main)):
X_noisey[i] = Xconv_main[i] + Xconv_noise[noise_sample[i]]*noise_value[i]
X_noisey = X_noisey/np.max(X_noisey, axis=1).reshape(-1, 1)
X_noisey[abs(X_noisey[...])<2**-500] = 0
return X_noisey
    def _get_coverage_shifted_X(self, X0cov, Y, COVERAGE):
"""Get frequencies and intensities shifted with coveage.
Parameters
----------
X0cov : numpy.ndarray
Numpy array of size (n,2) where $n$ is the number of frequency
and intensity pairs.
Y : numpy.ndarray
Target classes/groups (binding-types or GCN labels)
COVERAGE : str or float
            The coverage at which the synthetic spectra are generated. If
            'high', spectra at various coverages are generated.
Returns
-------
X : numpy.ndarray
Frequencies and intensities shifted to account for coverage
NUM_TARGETS : int
Number of target classes/groups. Updated if TARGET is GCN and coverage is 'high'
"""
_coverage_shift = self._coverage_shift
high_coverage_path = self.HIGH_COV_PATH
VERBOSE = self.VERBOSE
TARGET = self.TARGET
BINDING_TYPES_with_4fold = self.BINDING_TYPES_with_4fold
NUM_TARGETS = self.NUM_TARGETS
#Adding Data for Extended Surfaces
if COVERAGE == 'high' and TARGET == 'GCN':
#reset NUM_TARGETS if this function has already been run.
NUM_TARGETS = len(set(Y[Y>0]))
if VERBOSE == True:
print('Adding high-coverage low index planes')
print('Initial number of targets: '+str(NUM_TARGETS))
with open(high_coverage_path, 'r') as infile:
HC_Ext = json.load(infile)
HC_CNPt = np.sort(np.array(list(set([np.min(i) for i in HC_Ext['CN_METAL']]))))
NUM_TARGETS += len(HC_CNPt)
HC_frequencies = []
HC_intensities = []
HC_classes = []
max_freqs = np.max([len(i) for i in HC_Ext['FREQUENCIES']])
for counter, i in enumerate(HC_CNPt):
for ii in range(len(HC_Ext['CN_METAL'])):
if np.min(HC_Ext['CN_METAL'][ii]) == i:
HC_classes.append(np.max(Y)+counter+1)
num_atop = len(np.array(HC_Ext['CN_ADSORBATE'][ii]
)[np.array(HC_Ext['CN_ADSORBATE'][ii]) == 1])
offset = max_freqs-len(HC_Ext['FREQUENCIES'][ii])
HC_frequencies.append(np.pad(HC_Ext['FREQUENCIES'][ii], (0, offset)
, 'constant', constant_values=0))
HC_intensities.append(np.pad(HC_Ext['INTENSITIES'][ii], (0, offset)
, 'constant', constant_values=0)/num_atop)
HC_classes = np.array(HC_classes)
#Change all high-coverage classes to just max(y)+1
#until more accurate experiments and DFT are available
NUM_TARGETS = NUM_TARGETS - len(HC_CNPt) + 1
HC_classes[...] = NUM_TARGETS
#
HC_frequencies = np.array(HC_frequencies)
HC_intensities = np.array(HC_intensities)
HC_X = np.array([np.array((HC_frequencies[i], HC_intensities[i]))
for i in range(len(HC_frequencies))])
offset = max_freqs-len(X0cov[0][0])
X = np.pad(X0cov, ((0, 0), (0, 0), (0, offset)), 'constant', constant_values=0)
if VERBOSE == True:
print('Final number of targets: '+str(NUM_TARGETS))
elif COVERAGE == 'high' and TARGET in ['binding_type', 'combine_hollow_sites']:
if VERBOSE == True:
print('testing all coverages')
X = X0cov
elif type(COVERAGE) == int or type(COVERAGE) == float:
if VERBOSE == True:
print('Relative coverage is ' + str(COVERAGE))
X = _coverage_shift(X0cov, BINDING_TYPES_with_4fold, COVERAGE, COVERAGE)
elif COVERAGE == 'low':
X = X0cov
if COVERAGE == 'high' and TARGET == 'GCN':
self.HC_X = HC_X
self.HC_classes = HC_classes
return X, NUM_TARGETS
    def _add_high_coverage_data(self, X, Y):
"""method for adding high coverage GCN data to set of spectra
Parameters
----------
X : numpy.ndarray
Zero coverage frequencies and intensities
Y : numpy.ndarray
GCN labels
Returns
-------
        X_new : numpy.ndarray
            Frequencies and intensities with high coverage data added
        Y_new : numpy.ndarray
            New target vector with high coverage GCN data added
"""
_perturb_spectra = self._perturb_spectra
HC_X = self.HC_X
HC_classes = self.HC_classes
X_new = X.copy()
Y_new = Y.copy()
HC_X_expanded, HC_classes_expanded, _ = _perturb_spectra(5, HC_X, HC_classes
, a=0.999, b=1.001)
X_new = np.concatenate((X_new, HC_X_expanded), axis=0)
Y_new = np.concatenate((Y_new, HC_classes_expanded), axis=0)
return (X_new, Y_new)
    def _get_balanced_data(self, X_new, Y_new, indices):
"""Correct for imbalances in the data for improved learning
Parameters
----------
X_new : numpy.ndarray
Full set of frequencies and intensities
Y_new : numpy.ndarray
class/groups that are imbalanced
        indices : list
            Indices for which individual X_new and Y_new will be selected.
Returns
-------
X_balanced : numpy.ndarray
Frequencies and intensities corresponding to balanced Y
Y_balanced : numpy.ndarray
Balanced set of classes/groups.
BINDING_TYPES_balanced : numpy.ndarray
Balanced set of binding-types that correspond to Y_balanced
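        Examples
        --------
        The oversampling idea with imblearn on toy indices (hypothetical data,
        using the current fit_resample API):

        >>> import numpy as np
        >>> from imblearn.over_sampling import RandomOverSampler
        >>> idx = np.arange(6).reshape(-1, 1)
        >>> y = np.array([1, 1, 1, 1, 2, 2])
        >>> idx_bal, y_bal = RandomOverSampler(random_state=0).fit_resample(idx, y)
        >>> int((y_bal == 1).sum()) == int((y_bal == 2).sum())
        True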
"""
TARGET = self.TARGET
BINDING_TYPES_with_4fold = self.BINDING_TYPES_with_4fold
COVERAGE = self.COVERAGE
        if len(set(Y_new[indices])) > 1:
            #fit_resample is the current imblearn API (fit_sample was removed)
            indices_balanced, Y_balanced = RandomOverSampler().fit_resample(np.array(indices).reshape(-1, 1), Y_new[indices].copy())
else:
indices_balanced = np.array(indices)
Y_balanced = Y_new[indices]
X_balanced = X_new[indices_balanced.flatten()]
#BINDING_TYPES_balanced used only if COVERAGE == 'high' and (TARGET == 'binding_type' or TARGET == 'combine_hollow_sites')
if COVERAGE == 'high' and TARGET in ['binding_type', 'combine_hollow_sites']:
BINDING_TYPES_balanced = BINDING_TYPES_with_4fold[indices_balanced.flatten()]
else:
BINDING_TYPES_balanced = None
return X_balanced, Y_balanced, BINDING_TYPES_balanced
    def _scale_and_perturb(self, X_balanced, Y_balanced, BINDING_TYPES_balanced,
                           IS_TRAINING_SET, TRAINING_ERROR):
        """Apply the scaling factor and perturb by the error associated with it.
Parameters
----------
X_balanced: numpy.ndarray
Full set of frequencies and intensities corresponding to balanced data
Y_balanced : numpy.ndarray
Balanced class/groups
        BINDING_TYPES_balanced : list
            Balanced binding types that correspond to spectra and target data
IS_TRAINING_SET : bool
Indicates whether synthetic spectra to be generated is for training or validating
TRAINING_ERROR : float or str
Tag that indicates treatment of error in scaling factor for training data.
Returns
-------
X_sample : numpy.ndarray
Sample of frequencies and intensities that will be mixed and then convoluted.
        Y_sample : numpy.ndarray
            Target classes that will be tabulated to generate fractional
            contributions to spectra
        BINDING_TYPES_sample : numpy.ndarray
            Binding-types that will be used in coverage scaling if COVERAGE is set to 'high'
"""
_perturb_spectra = self._perturb_spectra
scaling_factor_shift = self.scaling_factor_shift
_perturb_and_shift = self._perturb_and_shift
if IS_TRAINING_SET == True and type(TRAINING_ERROR) in [float,int]:
X_balanced = scaling_factor_shift(X_balanced)
X_sample, Y_sample, BINDING_TYPES_sample = _perturb_spectra(\
5, X_balanced, Y_balanced, a=1-TRAINING_ERROR, b=1+TRAINING_ERROR, BINDING_TYPES=BINDING_TYPES_balanced)
elif IS_TRAINING_SET == False or TRAINING_ERROR == 'gaussian':
X_sample, Y_sample, BINDING_TYPES_sample = _perturb_and_shift(5, X_balanced, Y_balanced, BINDING_TYPES_balanced)
elif IS_TRAINING_SET == True and TRAINING_ERROR is None:
X_balanced = scaling_factor_shift(X_balanced)
X_sample = X_balanced
Y_sample = Y_balanced
BINDING_TYPES_sample = BINDING_TYPES_balanced
return X_sample, Y_sample, BINDING_TYPES_sample
    def _get_sample_data(self, X, Y, indices, IS_TRAINING_SET, TRAINING_ERROR):
        """Generate a balanced, scaled, and perturbed sample of the primary data.
Parameters
----------
X : numpy.ndarray
Zero coverage frequencies and intensities
Y : numpy.ndarray
GCN labels
        indices : list
            Indices for which individual X_new and Y_new will be selected.
IS_TRAINING_SET : bool
Indicates whether synthetic spectra to be generated is for training or validating
TRAINING_ERROR : float or str
Tag that indicates treatment of error in scaling factor for training data.
Returns
-------
X_sample : numpy.ndarray
Sample of frequencies and intensities that will be mixed and then convoluted.
        Y_sample : numpy.ndarray
            Target classes that will be tabulated to generate fractional
            contributions to spectra
        BINDING_TYPES_sample : numpy.ndarray
            Binding-types that will be used in coverage scaling if COVERAGE is set to 'high'
"""
_scale_and_perturb = self._scale_and_perturb
_add_high_coverage_data = self._add_high_coverage_data
_get_balanced_data = self._get_balanced_data
TARGET = self.TARGET
COVERAGE = self.COVERAGE
#add high coverage GCN data and update indices if necessary
if COVERAGE == 'high' and TARGET == 'GCN':
X_new, Y_new = _add_high_coverage_data(X, Y)
indices = list(indices) + np.arange(Y_new.size)[Y.size:].tolist()
else:
X_new = X.copy()
Y_new = Y.copy()
#balance data based on imbalanced classes/groups
X_balanced, Y_balanced, BINDING_TYPES_balanced = _get_balanced_data(X_new, Y_new, indices)
        #add perturbations for improved fitting by accounting for frequency and intensity errors from DFT
X_sample, Y_sample, BINDING_TYPES_sample = _scale_and_perturb(X_balanced, Y_balanced, BINDING_TYPES_balanced\
,IS_TRAINING_SET, TRAINING_ERROR)
return X_sample, Y_sample, BINDING_TYPES_sample
    def set_spectra_properties(self, COVERAGE=None, MAX_COVERAGES=[1, 1, 1, 1],
                               LOW_FREQUENCY=200, HIGH_FREQUENCY=2200, ENERGY_POINTS=501):
"""Set spectra specific properties
Parameters
----------
        COVERAGE : str or float
            The coverage at which the synthetic spectra are generated. If
            'high', spectra at various coverages are generated.
MAX_COVERAGES : list
Maximum coverages allowed for each binding-type if COVERAGE
is set to 'high'
LOW_FREQUENCY : float
The lowest frequency for which synthetic spectra is generated
HIGH_FREQUENCY : float
The high frequency for which synthetic spectra is generated
ENERGY_POINTS : int
The number of points the synthetic spectra is discretized into
Attributes
----------
X : numpy.ndarray
Coverage shifted frequencies and intensities
        Y : numpy.ndarray
            The target variables. Either binding-type or GCN label.
        NUM_TARGETS : int
            The number of binding-types or GCN labels. This is set by the user
            and can be altered by set_GCNlabels() and _get_coverage_shifted_X.
X0cov : numpy.ndarray
The zero coverage frequency and intensity pairs
COVERAGE : str or float
The COVERAGE set by the user.
LOW_FREQUENCY : float
The low frequency set by the user.
HIGH_FREQUENCY : float
The high frequency set by the user
ENERGY_POINTS : int
The number of energy points set by the user.
MAX_COVERAGES : list
List of maximum coverages set by the user.
Notes
-----
This function calls _get_coverage_shifted_X in order to shift X frequencies
according to the specified coverage.
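        Examples
        --------
        Typical usage, assuming the packaged CO data is available:

        >>> ir_gen = IR_GEN(ADSORBATE='CO', TARGET='binding_type')  # doctest: +SKIP
        >>> ir_gen.set_spectra_properties(COVERAGE='low')  # doctest: +SKIP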
"""
assert type(COVERAGE) == float or COVERAGE==1 or COVERAGE \
in ['low', 'high'], "Coverage should be a float, 'low', or 'high'."
_get_coverage_shifted_X = self._get_coverage_shifted_X
TARGET = self.TARGET
BINDING_TYPES = self.BINDING_TYPES
X0cov = self.X0cov
#Assign the target variable Y to either GCN group or binding site
if TARGET == 'GCN':
try:
Y = self.GCNlabels
            except AttributeError:
                print("set_GCNlabels must be executed before spectra can be generated")
raise
else:
Y = BINDING_TYPES
        #Get coverage-shifted data and update the number of targets if applicable
X, NUM_TARGETS = _get_coverage_shifted_X(X0cov, Y, COVERAGE)
ENERGIES = np.linspace(LOW_FREQUENCY, HIGH_FREQUENCY, num=ENERGY_POINTS\
, endpoint=True)
self.ENERGIES = ENERGIES
self.X = X
self.Y = Y
self.NUM_TARGETS = NUM_TARGETS
self.X0cov = X0cov
self.COVERAGE = COVERAGE
self.LOW_FREQUENCY = LOW_FREQUENCY
self.HIGH_FREQUENCY = HIGH_FREQUENCY
self.ENERGY_POINTS = ENERGY_POINTS
self.MAX_COVERAGES = MAX_COVERAGES
    def get_synthetic_spectra(self, NUM_SAMPLES, indices, IS_TRAINING_SET, TRAINING_ERROR=None):
"""Obtain convoluted complex synthetic spectra
Parameters
----------
NUM_SAMPLES : int
Number of spectra to generate.
        indices : list
            List of indices from the primary dataset from which to generate spectra
        IS_TRAINING_SET : bool
            Indicates whether primary data should be used to compute the training
            or validation set.
        TRAINING_ERROR : int, str, or None
            Indicates the kind of perturbations to induce in the primary training data.
            If an integer, the perturbations are uniform; if 'gaussian', the perturbations
            are Gaussian with the same variance as that used to perturb the validation data.
Returns
-------
X : numpy.ndarray
Coverage shifted frequencies and intensities
Y : numpy.ndarray
The target variable histograms. Either binding-type or GCN label
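        Examples
        --------
        Generating a small set of spectra, assuming an IR_GEN instance ir_gen
        on which set_spectra_properties has already been run:

        >>> indices = list(range(len(ir_gen.Y)))  # doctest: +SKIP
        >>> Xconv, yconv = ir_gen.get_synthetic_spectra(100, indices, IS_TRAINING_SET=True)  # doctest: +SKIP
        >>> Xconv.shape  # doctest: +SKIP
        (100, 501)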
"""
_get_probabilities = self._get_probabilities
        _xyconv = self._xyconv
_get_sample_data = self._get_sample_data
NUM_TARGETS = self.NUM_TARGETS
X = self.X.copy()
Y = self.Y.copy()
X_sample, Y_sample, BINDING_TYPES_sample = \
_get_sample_data(X, Y, indices, IS_TRAINING_SET, TRAINING_ERROR)
probabilities = _get_probabilities(NUM_SAMPLES, NUM_TARGETS)
Xconv, yconv = _xyconv(X_sample, Y_sample, probabilities, BINDING_TYPES_sample)
#set numbers that may form denormals to zero to improve numerics
Xconv[Xconv[...]<2**-500] = 0
yconv[yconv[...]<2**-500] = 0
return (Xconv, yconv)
def fold(frequencies, intensities, LOW_FREQUENCY, HIGH_FREQUENCY, ENERGY_POINTS, FWHM, fL):
"""Generate spectra from set of frequencies and intensities
Parameters
----------
    frequencies : list or numpy.ndarray
Set of molecular frequencies
intensities : list or numpy.ndarray
Intensities that correspond to frequencies
LOW_FREQUENCY : float
The lowest frequency for which synthetic spectra is generated
HIGH_FREQUENCY : float
The high frequency for which synthetic spectra is generated
ENERGY_POINTS : int
The number of points the synthetic spectra is discretized into
FWHM : float
Full-width-half-maximum of the desired spectra
    fL : float
        Fraction of the spectrum to be convoluted by a Lorentzian transform.
        The remaining portion of the transform that makes up the FWHM is
        sourced from a Gaussian convoluting function.
Returns
-------
spectrum : numpy.ndarray
The spectrum that is generated from the set of frequencies and intensities.
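    Examples
    --------
    Folding two stick frequencies into a 501-point spectrum:

    >>> spectrum = fold([480.0, 2080.0], [0.5, 1.0], 200, 2200, 501, FWHM=20.0, fL=0.3)
    >>> spectrum.shape
    (501,)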
"""
energies = np.linspace(LOW_FREQUENCY, HIGH_FREQUENCY, num=ENERGY_POINTS, endpoint=True)
energy_spacing = energies[1]-energies[0]
    if FWHM < 2*energy_spacing:
        raise ValueError('Function input FWHM must be at least twice'
                         ' the energy spacing to prevent information loss.'
                         ' It must therefore be at least ' + str(2*energy_spacing))
    assert HIGH_FREQUENCY > LOW_FREQUENCY, "The high frequency must be greater than the low frequency"
sigma = FWHM/(2.0 * np.sqrt(2.0 * np.log(2.)))
prefactor = 1.0/(sigma * np.sqrt(2.0 * np.pi))
#Reshape array energies to be (ENERGY_POINTS,1) dimensional
energies2D = energies.reshape((-1, 1))
#Create array that is duplicate of frequencies
freq2D = np.tile(frequencies, (ENERGY_POINTS, 1))
int2D = np.tile(intensities, (ENERGY_POINTS, 1))
#contribution of each frequency on each energy to total intensity
int_matrix = (1-fL)*int2D*prefactor*np.exp(-(freq2D-energies2D)**2/(2.0*sigma**2)) \
+ fL*int2D*2/(FWHM*np.pi*(4/FWHM**2*(freq2D-energies2D)**2+1))
spectrum = np.sum(int_matrix, axis=1)
return spectrum
def HREEL_2_scaledIR(HREEL, frequency_range=None, PEAK_CONV=2.7):
    """Convert an HREEL spectrum to a scaled IR spectrum.
Parameters
----------
    HREEL : numpy.ndarray
        HREEL spectrum of size (2, n) where n is the number of points into
        which the frequency and intensity are discretized.
frequency_range : numpy.ndarray
Frequency range onto which HREEL will be interpolated after conversion to IR.
PEAK_CONV : float
The intensity is scaled by the wavenumber raised to PEAK_CONV.
    Returns
    -------
    IR_scaled : numpy.ndarray
        The IR spectrum that the HREEL spectrum is converted to.
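    Examples
    --------
    Converting a toy HREEL trace (frequency row, intensity row):

    >>> import numpy as np
    >>> hreel = np.array([np.linspace(200, 2200, 50), np.ones(50)])
    >>> ir = HREEL_2_scaledIR(hreel)
    >>> ir.shape, float(ir.max())
    ((501,), 1.0)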
"""
if frequency_range is None:
frequency_range = np.linspace(200,2200,num=501,endpoint=True)
IR = np.interp(frequency_range, HREEL[0], HREEL[1]*HREEL[0]**PEAK_CONV, left=None, right=None, period=None)
IR_scaled = IR/np.max(IR)
return IR_scaled