Source code for jl_spectra_2_structure.jl_spectra_2_structure

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 19 11:05:24 2017

@author: lansf
"""
from __future__ import absolute_import, division, print_function
import os
import numpy as np
import json
import pkg_resources
from sklearn.cluster import KMeans
from imblearn.over_sampling import RandomOverSampler

def get_default_data_paths(adsorbate):
    """ Get default paths to primary data of frequencies and intensities.

    Parameters
    ----------
    adsorbate : str
        Name of the adsorbate for which to get the default primary DFT data.
        Data is already provided for 'CO', 'NO', and 'C2H4'.

    Returns
    -------
    nanoparticle_path : str
        File path where nanoparticle or single adsorbate data will be saved.
    isotope_path : str
        File path where isotope data for CO and NO will be saved.
    high_coverage_path : str
        File path where high coverage data for CO will be saved.
    coverage_scaling_path : str
        File path where coverage scaling coefficients will be saved.

    Notes
    -----
    Returns default frequencies to project intensities onto as well as
    default paths for locations of the pure and mixture spectroscopic data.
    """
    data_path = pkg_resources.resource_filename(__name__, 'data/')
    nanoparticle_path = os.path.join(data_path, 'dft_nanoparticle/single_'+adsorbate+'.json')
    isotope_path = os.path.join(data_path, 'dft_surface/isotope_'+adsorbate+'.json')
    high_coverage_path = os.path.join(data_path, 'dft_surface/high_coverage_'+adsorbate+'.json')
    coverage_scaling_path = os.path.join(data_path, 'coverage_scaling_params_'+adsorbate+'.json')
    return (nanoparticle_path, isotope_path, high_coverage_path,
            coverage_scaling_path)
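# Illustrative usage (a minimal sketch, not part of the module; 'CO' data
# ships with the package, the variable names are arbitrary):
#
#     nano_path, iso_path, high_cov_path, cov_scale_path = get_default_data_paths('CO')
#     with open(nano_path, 'r') as infile:
#         co_nanoparticle_data = json.load(infile)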
def get_exp_data_path():
    """ Get default paths to experimental data.

    Returns
    -------
    experimental_data : list
        List of paths to experimental data.
    """
    data_path = pkg_resources.resource_filename(__name__, 'data/')
    experimental_path = os.path.join(data_path, 'experimental')
    experimental_data = [os.path.join(experimental_path, file)
                         for file in os.listdir(experimental_path)]
    return experimental_data
class IR_GEN:
    """Class for generating complex synthetic IR spectra."""

    def __init__(self, ADSORBATE='CO', INCLUDED_BINDING_TYPES=[1, 2, 3, 4],
                 TARGET='binding_type', NUM_TARGETS=None, nanoparticle_path=None,
                 high_coverage_path=None, coverage_scaling_path=None,
                 VERBOSE=False):
        """
        Parameters
        ----------
        ADSORBATE : str
            Adsorbate for which the spectra are to be generated.
        INCLUDED_BINDING_TYPES : list
            Binding types whose frequencies/intensities from the primary data
            set will be included in generating the complex spectra.
        TARGET : str
            Geometric descriptor for which the target histogram is to be
            generated. Can be 'binding_type', 'GCN', or
            'combine_hollow_sites'. If it is 'combine_hollow_sites' then
            3-fold and 4-fold sites are grouped together.
        NUM_TARGETS : int
            Number of GCN groups to predict. Only used if TARGET='GCN', since
            GCN must be discretized.
        nanoparticle_path : str
            File path where nanoparticle or single adsorbate json data is saved.
        high_coverage_path : str
            File path where high coverage data for CO is saved.
        coverage_scaling_path : str
            File path where coverage scaling coefficients are saved.
        VERBOSE : bool
            Controls the printing of status statements.

        Attributes
        ----------
        BINDING_TYPES_with_4fold : numpy.ndarray
            List of all binding types.
        TARGET : str
            Set during initialization.
        NUM_TARGETS : int
            Set during initialization.
        X0cov : numpy.ndarray
            Set of frequencies and intensities at low coverage.
        BINDING_TYPES : numpy.ndarray
            Binding types to be predicted; accounts for both filtering and
            merging of certain sites.
        GCNList : numpy.ndarray
            GCN values for the data points.
        NANO_PATH : str
            Set during initialization.
        HIGH_COV_PATH : str
            Set during initialization.
        COV_SCALE_PATH : str
            Set during initialization.
        ADSORBATE : str
            Set during initialization.
        INCLUDED_BINDING_TYPES : list
            Set during initialization.
        VERBOSE : bool
            Set during initialization.
        """
        assert TARGET in ['binding_type', 'GCN', 'combine_hollow_sites'], \
            "incorrect TARGET given"
        assert type(INCLUDED_BINDING_TYPES) in [list, tuple, np.ndarray], \
            "Included Binding Types should be a list"
        #number of target variables
        nano_path, isotope_path, high_cov_path, cov_scale_path = \
            get_default_data_paths(ADSORBATE)
        if nanoparticle_path is None:
            nanoparticle_path = nano_path
        if ADSORBATE in ['NO', 'CO']:
            max_coordination = 4
            if coverage_scaling_path is None:
                coverage_scaling_path = cov_scale_path
            if ADSORBATE == 'CO':
                if high_coverage_path is None:
                    high_coverage_path = high_cov_path
        elif ADSORBATE in ['C2H4']:
            max_coordination = 2
        with open(nanoparticle_path, 'r') as infile:
            nanoparticle_data = json.load(infile)
        Xfreq_ALL = np.array(nanoparticle_data['FREQUENCIES'], dtype='float')
        is_local_minima = np.min(Xfreq_ALL, axis=1) > 0
        BINDING_TYPES_unfiltered = np.array(nanoparticle_data['CN_ADSORBATE'])
        max_forces = np.array(nanoparticle_data['MAX_FORCE'])
        is_adsorbed = BINDING_TYPES_unfiltered > 0
        if TARGET in ['binding_type', 'combine_hollow_sites']:
            NUM_TARGETS = len(INCLUDED_BINDING_TYPES)
        correct_coordination = BINDING_TYPES_unfiltered <= max_coordination
        small_force = max_forces < 0.05
        select_files = np.all((is_local_minima, is_adsorbed, small_force,
                               correct_coordination), axis=0)
        for key in nanoparticle_data.keys():
            nanoparticle_data[key] = np.array(nanoparticle_data[key])[select_files]
        nanoparticle_data['INTENSITIES'][nanoparticle_data['FREQUENCIES'] == 0] = 0
        BINDING_TYPES_with_4fold = nanoparticle_data['CN_ADSORBATE']
        if TARGET == 'combine_hollow_sites':
            nanoparticle_data['CN_ADSORBATE'][nanoparticle_data['CN_ADSORBATE'] == 4] = 3
            NUM_TARGETS -= 1
            if VERBOSE == True:
                print('grouping hollow sites')
        self.BINDING_TYPES_with_4fold = BINDING_TYPES_with_4fold
        self.TARGET = TARGET
        self.NUM_TARGETS = NUM_TARGETS
        self.X0cov = np.array([(nanoparticle_data['FREQUENCIES'][i],
                                nanoparticle_data['INTENSITIES'][i])
                               for i in range(len(nanoparticle_data['FREQUENCIES']))])
        self.BINDING_TYPES = nanoparticle_data['CN_ADSORBATE']
        self.GCNList = nanoparticle_data['GCN']
        self.NANO_PATH = nanoparticle_path
        self.HIGH_COV_PATH = high_coverage_path
        self.COV_SCALE_PATH = coverage_scaling_path
        self.ADSORBATE = ADSORBATE
        self.INCLUDED_BINDING_TYPES = INCLUDED_BINDING_TYPES
        self.VERBOSE = VERBOSE
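    # Example instantiation (a sketch; the keyword values are illustrative
    # assumptions, not recommendations from the module):
    #
    #     ir_gen = IR_GEN(ADSORBATE='CO', INCLUDED_BINDING_TYPES=[1, 2, 3, 4],
    #                     TARGET='binding_type', VERBOSE=True)
    #     # ir_gen.X0cov now holds the filtered low-coverage
    #     # (frequency, intensity) pairs.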
    def set_GCNlabels(self, Minimum=0, BINDING_TYPE_FOR_GCN=[1],
                      showfigures=False, figure_directory='show'):
        """ Cluster GCN values into groups/classes using k-means clustering.

        Parameters
        ----------
        Minimum : int
            Minimum number of datapoints in each cluster. If a generated
            cluster has fewer than this number of datapoints it is merged
            with the next cluster. If the last cluster has fewer than the
            minimum number of datapoints it is merged with the previous
            cluster.
        BINDING_TYPE_FOR_GCN : list
            List of binding types whose GCN values are to be included in
            clustering. Binding types included will have a GCN label of 1
            through the number of clusters. Binding types not included will
            be assigned a GCN label of zero.
        showfigures : bool
            Whether or not to generate figures visualizing the clusters and
            their location in GCN-space.
        figure_directory : str
            Either a directory where figures are to be saved or the string
            'show', which indicates that the figure is to be sent to gui
            output.

        Attributes
        ----------
        GCNlabels : numpy.ndarray
            GCN label assigned to each primary datapoint.
        NUM_TARGETS : int
            Updated number of targets. If n of the clusters generated by the
            k-means algorithm have fewer than the minimum number of
            datapoints (after merging), the NUM_TARGETS originally
            instantiated with the class is reduced by n.

        Notes
        -----
        Assigns each primary datapoint a GCN label based on its GCN value
        using k-means clustering, where the number of clusters is equal to
        the number of targets instantiated with the class. This must be run
        if one wishes to learn a distribution of GCN sites, as GCN is
        continuous. K-means clustering is an unsupervised learning technique
        that generates clusters/groups that are relatively evenly spaced
        with roughly the same number of datapoints in each cluster.
""" VERBOSE = self.VERBOSE if VERBOSE == True: print('Initial number of targets: '+str(self.NUM_TARGETS)) ADSORBATE = self.ADSORBATE NUM_TARGETS = self.NUM_TARGETS GCNList = self.GCNList BINDING_TYPES = self.BINDING_TYPES GCNList_selected = GCNList[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)] KC = KMeans(n_clusters=NUM_TARGETS, random_state=0).fit(GCNList_selected.reshape(-1, 1)) KC_new = np.zeros(NUM_TARGETS, dtype='int') KC_new[0] = KC.labels_[np.argmin(GCNList_selected)] for i in range(NUM_TARGETS-1): KC_new[i+1] = KC.labels_[np.isin(KC.labels_, KC_new[0:i+1]) == False]\ [np.argmin(GCNList_selected[np.isin(KC.labels_, KC_new[0:i+1]) == False])] KC2class = dict(zip(KC_new, np.arange(1, len(KC_new)+1, dtype='int'))) KCclass = np.array([KC2class[i] for i in KC.labels_]) for i in range(NUM_TARGETS-1)[::-1]: NUM_IN_CLASS = len(KCclass[KCclass == i+2]) if NUM_IN_CLASS < Minimum: for ii in range(NUM_TARGETS)[::-1]: if KC2class[ii] == i+2: KC2class.update({ii:i+1}) KCclass = np.array([KC2class[ii] for ii in KC.labels_]) NUM_IN_CLASS = len(KCclass[KCclass == 1]) if NUM_IN_CLASS < Minimum: for i in range(NUM_TARGETS): if KC2class[i] > 1: KC2class.update({i:KC2class[i]-1}) NUM_IN_CLASS = len(KCclass[KCclass == 2]) if NUM_IN_CLASS < Minimum: for i in range(NUM_TARGETS): if KC2class[i] > 2: KC2class.update({i:KC2class[i]-1}) KCclass = np.array([KC2class[i] for i in KC.labels_]) NUM_TARGETS = len(set(KCclass)) GCNlabels = np.zeros(len(GCNList), dtype='int') GCNlabels[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)] = KCclass BreakPoints = np.linspace(0, 8.5, num=810) BreakLabels = KC.predict(BreakPoints.reshape(-1, 1)) BreakLabels = [KC2class[i] for i in BreakLabels] BreakMin = np.array([np.min(BreakPoints[BreakLabels == i]) for i in np.arange(1, NUM_TARGETS+1)]) BreakMax = np.array([np.max(BreakPoints[BreakLabels == i]) for i in np.arange(1, NUM_TARGETS+1)]) if showfigures == True: import matplotlib.pyplot as plt plt.figure() plt.scatter(GCNList_selected, GCNList_selected, c=KC.labels_) for i in BreakMax: plt.plot([0, i, 2*i], [2*i, i, 0], 'k-') plt.xlabel('GCN') plt.ylabel('GCN') plt.xlim([0, 8.5]) plt.ylim([0, 8.5]) BreakString = zip(np.around(BreakMin, decimals=1), np.around(BreakMax, decimals=1)) BreakString = [str(count+1)+': '+str(i[0])+'-'+str(i[1]) for count, i in enumerate(BreakString)] plt.figure() #ax = plt.subplot() plt.hist(GCNlabels[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)], bins=np.arange(0.5, NUM_TARGETS+1.5), rwidth=0.5) plt.xticks(range(1, NUM_TARGETS+1)) #ax.set_xticklabels(greater_than) plt.xlabel('GCN Group') plt.ylabel('DFT Samples') plt.show() print(BreakString) from matplotlib import rcParams rcParams['lines.linewidth'] = 2 rcParams['lines.markersize'] = 5 params = {'figure.autolayout': True} rcParams.update(params) if figure_directory == 'show': plt.figure() else: plt.figure(0, figsize=(3.5, 3), dpi=400) #ax = plt.subplot() plt.hist(GCNlabels[np.isin(BINDING_TYPES,BINDING_TYPE_FOR_GCN)], bins=np.arange(0.5, NUM_TARGETS+1.5), rwidth=0.5) plt.xticks(range(1, NUM_TARGETS+1)) #ax.set_xticklabels(greater_than) plt.xlabel('GCN Group') plt.ylabel('DFT Samples') if figure_directory == 'show': plt.show() else: plt.savefig(os.path.join(figure_directory,ADSORBATE+'GCN_Clustering.jpg'), format='jpg') plt.close() self.GCNlabels = GCNlabels self.NUM_TARGETS = NUM_TARGETS if VERBOSE == True: print('Final number of targets: '+str(self.NUM_TARGETS))
    def _get_probabilities(self, num_samples, NUM_TARGETS):
        """ Get probabilities for sampling data from different classes/groups.

        Parameters
        ----------
        num_samples : int
            Number of complex spectra to generate.
        NUM_TARGETS : int
            Number of classes/labels that are desired. Usually the number of
            binding types or the number of GCN labels > 0.

        Returns
        -------
        probabilities : numpy.ndarray
            The probabilities to select each binding type or GCN group. Has
            dimensions (num_samples, NUM_TARGETS).

        Notes
        -----
        Returns a 2-D numpy.ndarray of length num_samples along the first
        dimension and NUM_TARGETS along the second dimension. Elements
        correspond to the probability of selecting a primary datapoint from
        a specific class/group such that the array sums to 1 along the
        second dimension. The probability assigned to the first index along
        the second dimension is drawn from a uniform distribution between 0
        and 1, while the probability assigned to each following index $n$ is
        drawn uniformly from $[0, 1-\sum_{i<n} p_{i}]$, where the sum runs
        over all previously assigned indices. The probabilities are then
        shuffled along the second dimension. With this probability
        distribution, the contribution to a spectrum from any given
        class/label is most likely zero, and the likelihood of a
        contribution decreases monotonically as the contribution goes to 1.
        This distribution ensures that all fractional contributions are
        sampled as equally as possible.
        """
        probabilities = np.zeros((num_samples, NUM_TARGETS))
        A = np.zeros(num_samples)
        for i in range(NUM_TARGETS-1):
            probs = (1-A)*np.random.random_sample(size=num_samples)
            A = A+probs
            probabilities[:, i] = probs
        probabilities[:, -1] = 1 - np.sum(probabilities, axis=1)
        #set any slightly negative probabilities to 0
        probabilities[probabilities[...] < 2**-500] = 0
        [np.random.shuffle(i) for i in probabilities]
        return probabilities
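    # Sanity-check sketch of the sampling scheme described above
    # (illustrative only; _get_probabilities is an internal method):
    #
    #     probs = ir_gen._get_probabilities(num_samples=1000, NUM_TARGETS=4)
    #     assert probs.shape == (1000, 4)
    #     assert np.allclose(probs.sum(axis=1), 1)  # each row is a distribution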
    def _perturb_spectra(self, perturbations, X, y, a=0.999, b=1.001,
                         BINDING_TYPES=None):
        """ Generate perturbations of the frequencies and intensities uniformly.

        Parameters
        ----------
        perturbations : int
            The number of perturbed primary datapoints per original datapoint.
        X : numpy.ndarray
            3-D numpy array of frequencies and intensities. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints, $n = 2$, and $p$ is the number of
            frequencies/intensities for each datapoint.
        y : numpy.ndarray
            The array of target variables to be regressed (binding types or
            GCN labels).
        a : float
            The lower bound of the uniform distribution.
        b : float
            The upper bound of the uniform distribution.
        BINDING_TYPES : numpy.ndarray
            Binding types of the primary datapoints.

        Returns
        -------
        Xperturbed : numpy.ndarray
            Perturbed intensities and frequencies. Has dimensions
            $m_{new} \times n \times p$ where $m_{new} = m \times
            perturbations$ is the number of primary datapoints, $n = 2$, and
            $p$ is the number of frequencies/intensities for each datapoint.
        yperturbed : numpy.ndarray
            The y-variable to be regressed, such as binding type or GCN group.
        BINDING_TYPES_perturbed : numpy.ndarray
            Binding types of the perturbed primary datapoints.

        Notes
        -----
        Returns a tuple of perturbed primary datapoints. The goal is to
        improve the predictive range of the regressed model by expanding the
        datapoints and accounting for error in the primary data (DFT).
        Spectra are perturbed according to a uniform distribution between
        $a$ and $b$.
        """
        X = X.copy()
        y = y.copy()
        perturbed_values = (b-a)*np.random.random_sample((X.shape[0]*perturbations,
                                                          X.shape[1], X.shape[2]))+a
        Xperturbed = X.repeat(perturbations, axis=0)*perturbed_values
        yperturbed = y.repeat(perturbations, axis=0)
        if BINDING_TYPES is not None:
            BINDING_TYPES = BINDING_TYPES.copy()
            BINDING_TYPES_perturbed = BINDING_TYPES.repeat(perturbations, axis=0)
        else:
            BINDING_TYPES_perturbed = None
        return (Xperturbed, yperturbed, BINDING_TYPES_perturbed)
    def _mixed_lineshape(self, FWHM, fL, ENERGY_POINTS, energy_spacing):
        """ Build a Lorentzian/Gaussian transform to induce spectral broadening.

        Parameters
        ----------
        FWHM : float
            Full-width-half-maximum of the desired spectra.
        fL : float
            Fraction of the spectra to be convoluted by a Lorentzian
            transform. The remaining portion of the transform that makes up
            the FWHM comes from a Gaussian convoluting function.
        ENERGY_POINTS : int
            Number of energy points in the desired spectra.
        energy_spacing : float
            Spacing of the energy points.

        Returns
        -------
        transform : numpy.ndarray
            Transform that is convolved with a spectrum that has a narrow
            line width in order to produce spectra with greater line widths.

        Notes
        -----
        Accepts a full-width-half-maximum, a fraction of Lorentzian, an
        energy spacing and a number of points, and produces the transform
        with which the spectra are convolved through Fourier convolution in
        order to produce realistic experimental spectra.
        """
        numpoints = 2*ENERGY_POINTS-1
        #x is the x-axis of the transform whose values are spaced by
        #energy_spacing and centered at zero such that the total number of
        #points is 2*ENERGY_POINTS - 1
        x = energy_spacing*(np.arange(numpoints, dtype='int')
                            - int((numpoints-1)/2))
        b = 0.5/np.sqrt(np.log(2))
        L_prefactor = 4*2/(FWHM*np.pi)
        G_prefactor = 4*1/(b*FWHM*np.sqrt(np.pi))
        specL = 1.0/(1.0+4.0*(x/FWHM)**2)
        specG = np.exp(-(x/(b*FWHM))**2)
        transform = fL*L_prefactor*specL+(1-fL)*G_prefactor*specG
        return transform
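    # Sketch of the pseudo-Voigt transform (the numbers are assumptions):
    # both components use the same area-normalization convention, so fL
    # trades line shape without rescaling the integrated intensity.
    #
    #     transform = ir_gen._mixed_lineshape(FWHM=40.0, fL=0.5,
    #                                         ENERGY_POINTS=501,
    #                                         energy_spacing=4.0)
    #     # len(transform) == 2*501 - 1, so np.convolve(mesh, transform,
    #     # mode='valid') returns a spectrum with exactly 501 points.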
    def _coverage_shift(self, X, BINDING_TYPES, SELF_COVERAGE, TOTAL_COVERAGE):
        """ Shift frequencies and intensities to account for coverage effects.

        Parameters
        ----------
        X : numpy.ndarray
            3-D numpy array of frequencies and intensities. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints, $n = 2$, and $p$ is the number of
            frequencies/intensities for each datapoint.
        BINDING_TYPES : numpy.ndarray
            Binding types of the primary datapoints.
        SELF_COVERAGE : numpy.ndarray of floats
            Relative spatial coverage of each binding type.
        TOTAL_COVERAGE : numpy.ndarray of floats
            Relative combined coverage in which the primary datapoint "sits".

        Returns
        -------
        Xcov : numpy.ndarray
            3-D numpy array of frequencies and intensities that have been
            shifted to account for coverage effects. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints, $n = 2$, and $p$ is the number of
            frequencies/intensities for each datapoint.
        """
        coverage_scaling_path = self.COV_SCALE_PATH
        ones = np.ones(X.shape[0])
        Xfrequencies = X[:, 0].copy()
        Xintensities = X[:, 1].copy()
        if type(SELF_COVERAGE) == int or type(SELF_COVERAGE) == float:
            SELF_COVERAGE = ones*SELF_COVERAGE
        if type(TOTAL_COVERAGE) == int or type(TOTAL_COVERAGE) == float:
            TOTAL_COVERAGE = ones*TOTAL_COVERAGE
        with open(coverage_scaling_path, 'r') as infile:
            Coverage_Scaling = json.load(infile)
        #maximum coverage obtained from Norton et al. 1982
        ABS_COVERAGE = SELF_COVERAGE*0.096
        TOTAL_COVERAGE_ABS = TOTAL_COVERAGE*0.096
        #site-type: atop, bridge, threefold, fourfold
        CO_STRETCH_IDX = np.argmax(Xfrequencies, axis=1)
        #Copy frequencies and update the CO stretch before updating all
        #modes, then set the CO stretch.
        CO_frequencies = Xfrequencies.copy()[np.arange(len(Xfrequencies)), CO_STRETCH_IDX]
        CO_intensities = Xintensities.copy()[np.arange(len(Xintensities)), CO_STRETCH_IDX]
        for i in range(4):
            CO_frequencies[BINDING_TYPES == i+1] = (
                CO_frequencies[BINDING_TYPES == i+1]
                * (Coverage_Scaling['CO_FREQ'][i]['SELF_CO_PER_A2']
                   * ABS_COVERAGE[BINDING_TYPES == i+1]
                   + Coverage_Scaling['CO_FREQ'][i]['CO_PER_A2']
                   * TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1] + 1))
            CO_intensities[BINDING_TYPES == i+1] = (
                CO_intensities[BINDING_TYPES == i+1]
                * np.exp(Coverage_Scaling['CO_INT_EXP']
                         * TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1]))
            Xfrequencies[BINDING_TYPES == i+1] = (
                Xfrequencies[BINDING_TYPES == i+1]
                * (Coverage_Scaling['PTCO_FREQ']['SELF_CO_PER_A2']
                   * ABS_COVERAGE[BINDING_TYPES == i+1]
                   + Coverage_Scaling['PTCO_FREQ']['CO_PER_A2']
                   * TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1]
                   + 1).reshape((-1, 1)))
            Xintensities[BINDING_TYPES == i+1] = (
                Xintensities[BINDING_TYPES == i+1]
                * np.exp(Coverage_Scaling['PTCO_INT_EXP']
                         * TOTAL_COVERAGE_ABS[BINDING_TYPES == i+1]).reshape((-1, 1)))
        Xfrequencies[np.arange(X.shape[0]), CO_STRETCH_IDX] = CO_frequencies
        #Xfrequencies cannot be less than 0
        Xfrequencies[Xfrequencies[...] < 2**-500] = 0
        Xintensities[np.arange(X.shape[0]), CO_STRETCH_IDX] = CO_intensities
        Xcov = np.stack((Xfrequencies, Xintensities), axis=1)
        return Xcov
    def scaling_factor_shift(self, X):
        """ Shift frequencies by a scaling factor to match experiment.

        Parameters
        ----------
        X : numpy.ndarray
            3-D numpy array of frequencies and intensities. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints, $n = 2$, and $p$ is the number of
            frequencies/intensities for each datapoint.

        Returns
        -------
        X : numpy.ndarray
            3-D numpy array of frequencies and intensities. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints, $n = 2$, and $p$ is the number of
            frequencies/intensities for each datapoint.

        Notes
        -----
        A scaling factor shift accounts for systematic errors in the
        functional to over- or under-bind as well as systematic errors in
        the harmonic approximation. Computed according to
        https://cccbdb.nist.gov/vibnotes.asp
        """
        Xfrequencies = X[:, 0].copy()
        Xintensities = X[:, 1].copy()
        CO_STRETCH_IDX = Xfrequencies > 1000
        MC_IDX = Xfrequencies < 1000
        #Scaling factors determined from comparing experiment to DFT.
        #Uncertainties are 0.00097 and 0.00182, respectively.
        SFCO = 1.0121
        SFMC = 0.969
        Xfrequencies[CO_STRETCH_IDX] = Xfrequencies[CO_STRETCH_IDX]*SFCO
        Xfrequencies[MC_IDX] = Xfrequencies[MC_IDX]*SFMC
        X = np.stack((Xfrequencies, Xintensities), axis=1)
        return X
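    # Worked example of the scaling factor shift (the input frequencies are
    # made up): a DFT C-O stretch at 2000 cm^-1 is scaled by SFCO to
    # 2000*1.0121 = 2024.2 cm^-1, while a metal-adsorbate mode at 400 cm^-1
    # is scaled by SFMC to 400*0.969 = 387.6 cm^-1. Intensities are left
    # unchanged.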
    def _perturb_and_shift(self, perturbations, X, y, BINDING_TYPES=None):
        """ Shift spectra with a scaling factor that has gaussian error.

        Parameters
        ----------
        perturbations : int
            The number of perturbed primary datapoints per original datapoint.
        X : numpy.ndarray
            3-D numpy array of frequencies and intensities. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints, $n = 2$, and $p$ is the number of
            frequencies/intensities.
        y : numpy.ndarray
            The array of target variables to be regressed (binding types or
            GCN labels).
        BINDING_TYPES : numpy.ndarray
            Binding types of the primary datapoints.

        Returns
        -------
        Xperturbed : numpy.ndarray
            Perturbed intensities and frequencies. Has dimensions
            $m_{new} \times n \times p$ where $m_{new} = m \times
            perturbations$ is the number of primary datapoints, $n = 2$, and
            $p$ is the number of frequencies/intensities for each datapoint.
        yperturbed : numpy.ndarray
            The y-variable to be regressed, such as binding type or GCN group.
        BINDING_TYPES_perturbed : numpy.ndarray
            Binding types of the perturbed primary datapoints.

        Notes
        -----
        Returns a tuple of perturbed primary datapoints. The goal is to
        incorporate error in the scaling factor into the validation and test
        sets. The scaling factor is perturbed according to its calculated
        standard error.
        """
        ADSORBATE = self.ADSORBATE
        X = X.copy()
        y = y.copy()
        Xperturbed = X.repeat(perturbations, axis=0)
        yperturbed = y.repeat(perturbations, axis=0)
        if BINDING_TYPES is not None:
            BINDING_TYPES = BINDING_TYPES.copy()
            BINDING_TYPES_perturbed = BINDING_TYPES.repeat(perturbations, axis=0)
        else:
            BINDING_TYPES_perturbed = None
        Xfrequencies = Xperturbed[:, 0].copy()
        Xintensities = Xperturbed[:, 1].copy()
        CO_STRETCH_IDX = Xfrequencies > 1000
        MC_IDX = Xfrequencies < 1000
        #Scaling factors determined from comparing experiment to DFT.
        #Uncertainties are 0.00097 and 0.00182, respectively.
        SFCO = 1.0121
        SFMC = 0.969
        if ADSORBATE == 'CO':
            perturb_CO_freq = 0.00097044 * np.random.standard_normal(
                Xfrequencies[CO_STRETCH_IDX].shape) + SFCO
            perturb_MC_freq = 0.00183104 * np.random.standard_normal(
                Xfrequencies[MC_IDX].shape) + SFMC
        else:
            perturb_CO_freq = 0.00097044 * np.random.standard_normal(
                Xfrequencies[CO_STRETCH_IDX].shape) + 1
            perturb_MC_freq = 0.00183104 * np.random.standard_normal(
                Xfrequencies[MC_IDX].shape) + 1
        perturb_CO_int = 0.00097044 * np.random.standard_normal(
            Xintensities[CO_STRETCH_IDX].shape) + 1
        perturb_MC_int = 0.00183104 * np.random.standard_normal(
            Xintensities[MC_IDX].shape) + 1
        Xfrequencies[CO_STRETCH_IDX] *= perturb_CO_freq
        Xfrequencies[MC_IDX] *= perturb_MC_freq
        Xintensities[CO_STRETCH_IDX] *= perturb_CO_int
        Xintensities[MC_IDX] *= perturb_MC_int
        Xperturbed_and_shifted = np.stack((Xfrequencies, Xintensities), axis=1)
        Xperturbed_and_shifted[Xperturbed_and_shifted[...] < 2**-500] = 0
        return (Xperturbed_and_shifted, yperturbed, BINDING_TYPES_perturbed)
    def _generate_spectra(self, Xfrequencies, Xintensities, ENERGIES):
        """ Convert sets of frequencies and intensities to spectra with
        minimal line width by Gaussian convolution.

        Parameters
        ----------
        Xfrequencies : numpy.ndarray
            2-D numpy array of frequencies. The ndarray has dimensions
            $m \times n$ where $m$ is the number of primary datapoints and
            $n$ is the number of frequencies/intensities for each datapoint.
        Xintensities : numpy.ndarray
            2-D numpy array of intensities. The ndarray has dimensions
            $m \times n$ where $m$ is the number of primary datapoints and
            $n$ is the number of frequencies/intensities for each datapoint.
        ENERGIES : numpy.ndarray
            Numpy array of energies onto which the frequencies and
            intensities will be convolved.

        Returns
        -------
        int_mesh : numpy.ndarray
            Numpy ndarray with dimensions $m \times p$ where $m$ is the
            number of primary datapoints and $p$ is the number of energy
            points onto which the frequencies and intensities are projected
            with Fourier convolution.

        Notes
        -----
        This preconvolution is necessary for numerical reasons before the
        data is convoluted to induce greater spectral broadening.
        """
        ENERGY_POINTS = len(ENERGIES)
        energy_spacing = ENERGIES[1]-ENERGIES[0]
        FWHM = 2*energy_spacing
        sigma = FWHM/(2.0 * np.sqrt(2.0 * np.log(2.)))
        prefactor = 1.0/(sigma * np.sqrt(2.0 * np.pi))
        energies2D = ENERGIES.reshape((-1, 1))
        int_mesh = np.zeros((Xfrequencies.shape[0], ENERGY_POINTS))
        #mesh everything on energy grids with a FWHM of twice the energy spacing
        for i in range(Xfrequencies.shape[0]):
            freq2D = np.tile(Xfrequencies[i], (ENERGY_POINTS, 1))
            int2D = np.tile(Xintensities[i], (ENERGY_POINTS, 1))
            temp = int2D*prefactor*np.exp(-(freq2D-energies2D)**2/(2.0*sigma**2))
            int_mesh[i] = np.sum(temp, axis=1)
        return int_mesh
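    # Sketch of the preconvolution step (the grid values are assumptions):
    # with ENERGY_POINTS=501 over 200-2200 cm^-1, energy_spacing = 4 cm^-1,
    # so each stick spectrum is broadened by a Gaussian with FWHM = 8 cm^-1,
    # i.e. sigma = 8/(2*sqrt(2*ln 2)) ~ 3.4 cm^-1.
    #
    #     energies = np.linspace(200, 2200, num=501, endpoint=True)
    #     int_mesh = ir_gen._generate_spectra(Xfrequencies, Xintensities,
    #                                         energies)
    #     # int_mesh.shape == (Xfrequencies.shape[0], 501)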
    def _xyconv(self, X_sample, Y_sample, probabilities, BINDING_TYPES_sample):
        """ Convolve a balanced sample of primary data and generate complex
        mixed spectra.

        Parameters
        ----------
        X_sample : numpy.ndarray
            3-D numpy array of frequencies and intensities. The ndarray has
            dimensions $m \times n \times p$ where $m$ is the number of
            primary datapoints after data class balancing, $n = 2$, and $p$
            is the number of frequencies/intensities.
        Y_sample : numpy.ndarray
            The array of target variables to be regressed (binding types or
            GCN labels).
        probabilities : numpy.ndarray
            The probabilities to select each binding type or GCN group. Has
            dimensions (num_samples, NUM_TARGETS).
        BINDING_TYPES_sample : numpy.ndarray
            Binding types of the primary datapoints after data class
            balancing.

        Returns
        -------
        Xconv : numpy.ndarray
            Convoluted complex spectra of dimensions $m \times n$ where $m$
            is the desired number of samples and $n$ is the number of
            energy points.
        yconv : numpy.ndarray
            Fraction of occupied binding types or GCN groups that contribute
            to the total spectra. yconv has dimensions $m \times p$ where
            $m$ is the desired number of samples and $p$ is the number of
            targets.
        """
        get_probabilities = self._get_probabilities
        _mixed_lineshape = self._mixed_lineshape
        _coverage_shift = self._coverage_shift
        _generate_spectra = self._generate_spectra
        NUM_TARGETS = self.NUM_TARGETS
        ENERGY_POINTS = self.ENERGY_POINTS
        COVERAGE = self.COVERAGE
        TARGET = self.TARGET
        INCLUDED_BINDING_TYPES = self.INCLUDED_BINDING_TYPES
        MAX_COVERAGES = self.MAX_COVERAGES
        ENERGIES = self.ENERGIES
        if TARGET == 'GCN':
            GCNlabels = self.GCNlabels
        energy_spacing = ENERGIES[1]-ENERGIES[0]
        Xfrequencies = X_sample[:, 0].copy()
        Xintensities = X_sample[:, 1].copy()
        np.random.shuffle(probabilities)
        num_samples = len(probabilities)
        num_simple_spectra = np.random.randint(1, high=201, size=num_samples)
        fLs = np.random.sample(num_samples)
        FWHMs = np.random.uniform(low=2, high=75, size=num_samples)
        Xconv = np.zeros((num_samples, ENERGY_POINTS))
        yconv = np.zeros((num_samples, NUM_TARGETS))
        y_mesh = np.zeros((Y_sample.size, NUM_TARGETS))
        #subtracting MIN_Y ensures that the indices into y_mesh match the
        #labels, even if the labels don't start at 0
        if TARGET == 'GCN':
            MIN_Y = min(GCNlabels[GCNlabels > 0])
        else:
            MIN_Y = min(INCLUDED_BINDING_TYPES)
        y_mesh[np.arange(Y_sample.size), Y_sample-MIN_Y] = 1
        sample_indices = np.arange(Y_sample.size)
        parray = np.zeros(Y_sample.size)
        shift_vector = np.zeros(Y_sample.size)
        if TARGET == 'GCN' or COVERAGE == 'low' or type(COVERAGE) in [float, int]:
            int_mesh = _generate_spectra(Xfrequencies, Xintensities, ENERGIES)
        else:
            coverage_parray = get_probabilities(num_samples, 11)
            coverage_totals = np.random.random_sample(size=[num_samples, 10])
        for i in range(num_samples):
            shift_vector[...] = 0
            non_zeros = np.random.randint(low=1, high=Y_sample.size)
            indices_to_shift = np.random.choice(sample_indices, size=non_zeros,
                                                replace=False)
            shift_vector[indices_to_shift] = np.random.random_sample(size=non_zeros)
            for ii in range(NUM_TARGETS):
                ii_MIN_Y = ii+MIN_Y
                shift_vector_sum = shift_vector[Y_sample == ii_MIN_Y].sum()
                if shift_vector_sum > 0:
                    shift_vector[Y_sample == ii_MIN_Y] /= shift_vector_sum
                    parray[Y_sample == ii_MIN_Y] = (probabilities[i, ii]
                                                    * shift_vector[Y_sample == ii_MIN_Y])
                else:
                    parray[Y_sample == ii_MIN_Y] = probabilities[i, ii]
            parray /= np.sum(parray)
            indices_primary = np.random.choice(sample_indices,
                                               size=num_simple_spectra[i],
                                               replace=True, p=parray)
            if TARGET == 'GCN' or COVERAGE == 'low' or type(COVERAGE) in [float, int]:
                combined_mesh = np.sum(int_mesh[indices_primary], axis=0)
            else:
                #Allow different coverages to exist on the surface and allow
                #the different binding types to have different coverages.
                #initialize coverages
                SELF_COVERAGE = np.random.random_sample(num_simple_spectra[i])
                TOTAL_COVERAGE = np.zeros_like(SELF_COVERAGE)
                COVERAGE_INDICES = np.random.choice(
                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                    size=num_simple_spectra[i], replace=True,
                    p=coverage_parray[i])
                #a coverage index of 0 indicates an island, so it keeps its
                #single coverage and is skipped here
                for ii in sorted(set(COVERAGE_INDICES.tolist()+[0]))[1:]:
                    TOTAL_COVERAGE[COVERAGE_INDICES == ii] = coverage_totals[i][ii-1]
                    #Set the coverage of each spectrum to the total coverage
                    #divided by the number of spectra being combined
                    SELF_COVERAGE[COVERAGE_INDICES == ii] = (
                        coverage_totals[i][ii-1]
                        / TOTAL_COVERAGE[COVERAGE_INDICES == ii].size)
                    #update the self coverage of identical binding types to
                    #be the same (their sum)
                    for iii in INCLUDED_BINDING_TYPES:
                        SELF_COVERAGE[np.all((COVERAGE_INDICES == ii,
                                              BINDING_TYPES_sample[indices_primary] == iii), axis=0)] \
                            = np.sum(SELF_COVERAGE[np.all((COVERAGE_INDICES == ii,
                                     BINDING_TYPES_sample[indices_primary] == iii), axis=0)])
                #decrease the maximum coverage at less favorable sites to
                #improve the prediction score
                for count, max_coverage in enumerate(MAX_COVERAGES):
                    SELF_COVERAGE[BINDING_TYPES_sample[indices_primary]
                                  == INCLUDED_BINDING_TYPES[count]] *= max_coverage
                #Ensure that the total coverage is compatible with the self coverage
                for ii in sorted(set(COVERAGE_INDICES.tolist()+[0]))[1:]:
                    coverage_factor = 0
                    for count, max_coverage in enumerate(MAX_COVERAGES):
                        coverage_factor += max_coverage * SELF_COVERAGE[
                            np.all((COVERAGE_INDICES == ii,
                                    BINDING_TYPES_sample[indices_primary]
                                    == INCLUDED_BINDING_TYPES[count]), axis=0)].size
                    TOTAL_COVERAGE[COVERAGE_INDICES == ii] *= (
                        coverage_factor/TOTAL_COVERAGE[COVERAGE_INDICES == ii].size)
                TOTAL_COVERAGE[COVERAGE_INDICES == 0] = SELF_COVERAGE[COVERAGE_INDICES == 0]
                Xcov = _coverage_shift(X_sample[indices_primary],
                                       BINDING_TYPES_sample[indices_primary],
                                       SELF_COVERAGE, TOTAL_COVERAGE)
                Xcovfrequencies = Xcov[:, 0].copy()
                Xcovintensities = Xcov[:, 1].copy()
                int_mesh = _generate_spectra(Xcovfrequencies, Xcovintensities,
                                             ENERGIES)
                for count, max_coverage in enumerate(MAX_COVERAGES):
                    int_mesh[BINDING_TYPES_sample[indices_primary]
                             == INCLUDED_BINDING_TYPES[count]] *= max_coverage
                combined_mesh = np.sum(int_mesh, axis=0)
            yconv[i] = np.sum(y_mesh[indices_primary], axis=0, dtype='int')
            transform = _mixed_lineshape(FWHMs[i], fLs[i], ENERGY_POINTS,
                                         energy_spacing)
            Xconv[i] = np.convolve(combined_mesh, transform, mode='valid')
        #Adjust the tabulated contributions to match those of the adjusted
        #spectra for the maximum coverage of bridge, 3-fold and 4-fold sites
        if COVERAGE == 'high':
            if TARGET == 'binding_type':
                for count, max_coverage in enumerate(MAX_COVERAGES):
                    yconv[:, count] *= max_coverage
            if TARGET == 'combine_hollow_sites':
                for count, max_coverage in enumerate(MAX_COVERAGES[0:len(MAX_COVERAGES)-1]):
                    yconv[:, count] *= max_coverage
        #10**-3 accounts for noise in experimental spectra
        Xconv += 10**-3*np.max(Xconv, axis=1).reshape((-1, 1)) \
                 * np.random.random_sample(Xconv.shape)
        #normalize so max X is 1 and make y a set of fractions that sum to 1
        Xconv /= np.max(Xconv, axis=1).reshape((-1, 1))
        yconv /= np.sum(yconv, axis=1).reshape((-1, 1))
        return (Xconv, yconv)
    def add_noise(self, Xconv_main, Xconv_noise, noise2signalmax=0.67):
        """ Add two sets of complex spectra, multiplying one by a uniform
        random variable between 0 and noise2signalmax, which is treated as
        noise.

        Parameters
        ----------
        Xconv_main : numpy.ndarray
            Convoluted complex spectra of dimensions $m \times n$ where $m$
            is the desired number of samples and $n$ is the number of
            energy points.
        Xconv_noise : numpy.ndarray
            Convoluted complex spectra of dimensions $m \times n$ where $m$
            is the desired number of samples and $n$ is the number of
            energy points. These spectra are treated as noise.

        Returns
        -------
        X_noisey : numpy.ndarray
            Convoluted complex spectra of dimensions $m \times n$ where $m$
            is the desired number of samples and $n$ is the number of
            energy points. Data has noise from Xconv_noise.
        """
        X_noisey = np.zeros((len(Xconv_main), len(Xconv_main[0])))
        noise_value = np.random.uniform(low=0, high=noise2signalmax,
                                        size=len(Xconv_main))
        noise_indices = np.arange(len(Xconv_noise))
        noise_sample = np.random.choice(noise_indices, size=len(Xconv_main),
                                        replace=True)
        for i in range(len(Xconv_main)):
            X_noisey[i] = Xconv_main[i] + Xconv_noise[noise_sample[i]]*noise_value[i]
        X_noisey = X_noisey/np.max(X_noisey, axis=1).reshape(-1, 1)
        X_noisey[abs(X_noisey[...]) < 2**-500] = 0
        return X_noisey
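    # Example of overlaying noise spectra on main spectra (a sketch; both
    # arrays would typically come from separate get_synthetic_spectra calls):
    #
    #     X_noisey = ir_gen.add_noise(Xconv_main, Xconv_noise,
    #                                 noise2signalmax=0.67)
    #     # Each row is renormalized so its maximum is 1 after the noise is
    #     # added.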
    def _get_coverage_shifted_X(self, X0cov, Y, COVERAGE):
        """Get frequencies and intensities shifted with coverage.

        Parameters
        ----------
        X0cov : numpy.ndarray
            Numpy array of size (n, 2) where $n$ is the number of frequency
            and intensity pairs.
        Y : numpy.ndarray
            Target classes/groups (binding types or GCN labels).
        COVERAGE : str or float
            The coverage at which the synthetic spectra are generated. If
            'high', spectra at various coverages are generated.

        Returns
        -------
        X : numpy.ndarray
            Frequencies and intensities shifted to account for coverage.
        NUM_TARGETS : int
            Number of target classes/groups. Updated if TARGET is 'GCN' and
            COVERAGE is 'high'.
        """
        _coverage_shift = self._coverage_shift
        high_coverage_path = self.HIGH_COV_PATH
        VERBOSE = self.VERBOSE
        TARGET = self.TARGET
        BINDING_TYPES_with_4fold = self.BINDING_TYPES_with_4fold
        NUM_TARGETS = self.NUM_TARGETS
        #Adding Data for Extended Surfaces
        if COVERAGE == 'high' and TARGET == 'GCN':
            #reset NUM_TARGETS if this function has already been run.
            NUM_TARGETS = len(set(Y[Y > 0]))
            if VERBOSE == True:
                print('Adding high-coverage low index planes')
                print('Initial number of targets: '+str(NUM_TARGETS))
            with open(high_coverage_path, 'r') as infile:
                HC_Ext = json.load(infile)
            HC_CNPt = np.sort(np.array(list(set([np.min(i) for i in HC_Ext['CN_METAL']]))))
            NUM_TARGETS += len(HC_CNPt)
            HC_frequencies = []
            HC_intensities = []
            HC_classes = []
            max_freqs = np.max([len(i) for i in HC_Ext['FREQUENCIES']])
            for counter, i in enumerate(HC_CNPt):
                for ii in range(len(HC_Ext['CN_METAL'])):
                    if np.min(HC_Ext['CN_METAL'][ii]) == i:
                        HC_classes.append(np.max(Y)+counter+1)
                        num_atop = len(np.array(HC_Ext['CN_ADSORBATE'][ii])
                                       [np.array(HC_Ext['CN_ADSORBATE'][ii]) == 1])
                        offset = max_freqs-len(HC_Ext['FREQUENCIES'][ii])
                        HC_frequencies.append(np.pad(HC_Ext['FREQUENCIES'][ii],
                                              (0, offset), 'constant',
                                              constant_values=0))
                        HC_intensities.append(np.pad(HC_Ext['INTENSITIES'][ii],
                                              (0, offset), 'constant',
                                              constant_values=0)/num_atop)
            HC_classes = np.array(HC_classes)
            #Change all high-coverage classes to just max(y)+1
            #until more accurate experiments and DFT are available
            NUM_TARGETS = NUM_TARGETS - len(HC_CNPt) + 1
            HC_classes[...] = NUM_TARGETS
            HC_frequencies = np.array(HC_frequencies)
            HC_intensities = np.array(HC_intensities)
            HC_X = np.array([np.array((HC_frequencies[i], HC_intensities[i]))
                             for i in range(len(HC_frequencies))])
            offset = max_freqs-len(X0cov[0][0])
            X = np.pad(X0cov, ((0, 0), (0, 0), (0, offset)), 'constant',
                       constant_values=0)
            if VERBOSE == True:
                print('Final number of targets: '+str(NUM_TARGETS))
        elif COVERAGE == 'high' and TARGET in ['binding_type', 'combine_hollow_sites']:
            if VERBOSE == True:
                print('testing all coverages')
            X = X0cov
        elif type(COVERAGE) == int or type(COVERAGE) == float:
            if VERBOSE == True:
                print('Relative coverage is ' + str(COVERAGE))
            X = _coverage_shift(X0cov, BINDING_TYPES_with_4fold, COVERAGE, COVERAGE)
        elif COVERAGE == 'low':
            X = X0cov
        if COVERAGE == 'high' and TARGET == 'GCN':
            self.HC_X = HC_X
            self.HC_classes = HC_classes
        return X, NUM_TARGETS
    def _add_high_coverage_data(self, X, Y):
        """Method for adding high coverage GCN data to the set of spectra.

        Parameters
        ----------
        X : numpy.ndarray
            Zero coverage frequencies and intensities.
        Y : numpy.ndarray
            GCN labels.

        Returns
        -------
        X_new : numpy.ndarray
            Frequencies and intensities with high coverage data.
        Y_new : numpy.ndarray
            New target vector with high coverage GCN data added.
        """
        _perturb_spectra = self._perturb_spectra
        HC_X = self.HC_X
        HC_classes = self.HC_classes
        X_new = X.copy()
        Y_new = Y.copy()
        HC_X_expanded, HC_classes_expanded, _ = _perturb_spectra(
            5, HC_X, HC_classes, a=0.999, b=1.001)
        X_new = np.concatenate((X_new, HC_X_expanded), axis=0)
        Y_new = np.concatenate((Y_new, HC_classes_expanded), axis=0)
        return (X_new, Y_new)
    def _get_balanced_data(self, X_new, Y_new, indices):
        """Correct for imbalances in the data for improved learning.

        Parameters
        ----------
        X_new : numpy.ndarray
            Full set of frequencies and intensities.
        Y_new : numpy.ndarray
            Classes/groups that are imbalanced.
        indices : list
            Indices for which individual X_new and Y_new will be selected.

        Returns
        -------
        X_balanced : numpy.ndarray
            Frequencies and intensities corresponding to balanced Y.
        Y_balanced : numpy.ndarray
            Balanced set of classes/groups.
        BINDING_TYPES_balanced : numpy.ndarray
            Balanced set of binding types that correspond to Y_balanced.
        """
        TARGET = self.TARGET
        BINDING_TYPES_with_4fold = self.BINDING_TYPES_with_4fold
        COVERAGE = self.COVERAGE
        if len(set(Y_new[indices])) > 1:
            indices_balanced, Y_balanced = RandomOverSampler().fit_sample(
                np.array(indices).reshape(-1, 1), Y_new[indices].copy())
        else:
            indices_balanced = np.array(indices)
            Y_balanced = Y_new[indices]
        X_balanced = X_new[indices_balanced.flatten()]
        #BINDING_TYPES_balanced is used only if COVERAGE == 'high' and
        #TARGET is 'binding_type' or 'combine_hollow_sites'
        if COVERAGE == 'high' and TARGET in ['binding_type', 'combine_hollow_sites']:
            BINDING_TYPES_balanced = BINDING_TYPES_with_4fold[indices_balanced.flatten()]
        else:
            BINDING_TYPES_balanced = None
        return X_balanced, Y_balanced, BINDING_TYPES_balanced
    def _scale_and_perturb(self, X_balanced, Y_balanced, BINDING_TYPES_balanced,
                           IS_TRAINING_SET, TRAINING_ERROR):
        """Apply the scaling factor and perturb by the error associated with
        the scaling factor.

        Parameters
        ----------
        X_balanced : numpy.ndarray
            Full set of frequencies and intensities corresponding to the
            balanced data.
        Y_balanced : numpy.ndarray
            Balanced classes/groups.
        BINDING_TYPES_balanced : list
            Balanced binding types that correspond to the spectra and target
            data.
        IS_TRAINING_SET : bool
            Indicates whether the synthetic spectra to be generated are for
            training or validating.
        TRAINING_ERROR : float or str
            Tag that indicates the treatment of error in the scaling factor
            for training data.

        Returns
        -------
        X_sample : numpy.ndarray
            Sample of frequencies and intensities that will be mixed and
            then convoluted.
        Y_sample : numpy.ndarray
            Target classes that will be tabulated to generate fractional
            contributions to the spectra.
        BINDING_TYPES_sample : numpy.ndarray
            Binding types that will be used in coverage scaling if COVERAGE
            is set to 'high'.
        """
        _perturb_spectra = self._perturb_spectra
        scaling_factor_shift = self.scaling_factor_shift
        _perturb_and_shift = self._perturb_and_shift
        if IS_TRAINING_SET == True and type(TRAINING_ERROR) in [float, int]:
            X_balanced = scaling_factor_shift(X_balanced)
            X_sample, Y_sample, BINDING_TYPES_sample = _perturb_spectra(
                5, X_balanced, Y_balanced, a=1-TRAINING_ERROR,
                b=1+TRAINING_ERROR, BINDING_TYPES=BINDING_TYPES_balanced)
        elif IS_TRAINING_SET == False or TRAINING_ERROR == 'gaussian':
            X_sample, Y_sample, BINDING_TYPES_sample = _perturb_and_shift(
                5, X_balanced, Y_balanced, BINDING_TYPES_balanced)
        elif IS_TRAINING_SET == True and TRAINING_ERROR is None:
            X_balanced = scaling_factor_shift(X_balanced)
            X_sample = X_balanced
            Y_sample = Y_balanced
            BINDING_TYPES_sample = BINDING_TYPES_balanced
        return X_sample, Y_sample, BINDING_TYPES_sample
    def _get_sample_data(self, X, Y, indices, IS_TRAINING_SET, TRAINING_ERROR):
        """Generate a balanced, scaled, and perturbed sample of the primary data.

        Parameters
        ----------
        X : numpy.ndarray
            Zero coverage frequencies and intensities.
        Y : numpy.ndarray
            GCN labels.
        indices : list
            Indices for which individual X_new and Y_new will be selected.
        IS_TRAINING_SET : bool
            Indicates whether the synthetic spectra to be generated are for
            training or validating.
        TRAINING_ERROR : float or str
            Tag that indicates the treatment of error in the scaling factor
            for training data.

        Returns
        -------
        X_sample : numpy.ndarray
            Sample of frequencies and intensities that will be mixed and
            then convoluted.
        Y_sample : numpy.ndarray
            Target classes that will be tabulated to generate fractional
            contributions to the spectra.
        BINDING_TYPES_sample : numpy.ndarray
            Binding types that will be used in coverage scaling if COVERAGE
            is set to 'high'.
        """
        _scale_and_perturb = self._scale_and_perturb
        _add_high_coverage_data = self._add_high_coverage_data
        _get_balanced_data = self._get_balanced_data
        TARGET = self.TARGET
        COVERAGE = self.COVERAGE
        #add high coverage GCN data and update indices if necessary
        if COVERAGE == 'high' and TARGET == 'GCN':
            X_new, Y_new = _add_high_coverage_data(X, Y)
            indices = list(indices) + np.arange(Y_new.size)[Y.size:].tolist()
        else:
            X_new = X.copy()
            Y_new = Y.copy()
        #balance data based on imbalanced classes/groups
        X_balanced, Y_balanced, BINDING_TYPES_balanced = _get_balanced_data(
            X_new, Y_new, indices)
        #add perturbations for improved fitting by accounting for frequency
        #and intensity errors from DFT
        X_sample, Y_sample, BINDING_TYPES_sample = _scale_and_perturb(
            X_balanced, Y_balanced, BINDING_TYPES_balanced,
            IS_TRAINING_SET, TRAINING_ERROR)
        return X_sample, Y_sample, BINDING_TYPES_sample
    def set_spectra_properties(self, COVERAGE=None, MAX_COVERAGES=[1, 1, 1, 1],
                               LOW_FREQUENCY=200, HIGH_FREQUENCY=2200,
                               ENERGY_POINTS=501):
        """Set spectra-specific properties.

        Parameters
        ----------
        COVERAGE : str or float
            The coverage at which the synthetic spectra are generated. If
            'high', spectra at various coverages are generated.
        MAX_COVERAGES : list
            Maximum coverages allowed for each binding type if COVERAGE is
            set to 'high'.
        LOW_FREQUENCY : float
            The lowest frequency for which synthetic spectra are generated.
        HIGH_FREQUENCY : float
            The highest frequency for which synthetic spectra are generated.
        ENERGY_POINTS : int
            The number of points into which the synthetic spectra are
            discretized.

        Attributes
        ----------
        X : numpy.ndarray
            Coverage shifted frequencies and intensities.
        Y : numpy.ndarray
            The target variables. Either binding type or GCN label.
        NUM_TARGETS : int
            The number of binding types or GCN labels. This is set by the
            user and can be altered by set_GCNlabels() and
            _get_coverage_shifted_X().
        X0cov : numpy.ndarray
            The zero coverage frequency and intensity pairs.
        COVERAGE : str or float
            The COVERAGE set by the user.
        LOW_FREQUENCY : float
            The low frequency set by the user.
        HIGH_FREQUENCY : float
            The high frequency set by the user.
        ENERGY_POINTS : int
            The number of energy points set by the user.
        MAX_COVERAGES : list
            List of maximum coverages set by the user.

        Notes
        -----
        This function calls _get_coverage_shifted_X in order to shift the X
        frequencies according to the specified coverage.
        """
        assert type(COVERAGE) == float or COVERAGE == 1 or COVERAGE \
            in ['low', 'high'], "Coverage should be a float, 'low', or 'high'."
        _get_coverage_shifted_X = self._get_coverage_shifted_X
        TARGET = self.TARGET
        BINDING_TYPES = self.BINDING_TYPES
        X0cov = self.X0cov
        #Assign the target variable Y to either GCN group or binding site
        if TARGET == 'GCN':
            try:
                Y = self.GCNlabels
            except:
                print("set_GCNlabels must be executed before spectra can be generated")
                raise
        else:
            Y = BINDING_TYPES
        #Get coverage shifted data and update the number of targets if applicable
        X, NUM_TARGETS = _get_coverage_shifted_X(X0cov, Y, COVERAGE)
        ENERGIES = np.linspace(LOW_FREQUENCY, HIGH_FREQUENCY,
                               num=ENERGY_POINTS, endpoint=True)
        self.ENERGIES = ENERGIES
        self.X = X
        self.Y = Y
        self.NUM_TARGETS = NUM_TARGETS
        self.X0cov = X0cov
        self.COVERAGE = COVERAGE
        self.LOW_FREQUENCY = LOW_FREQUENCY
        self.HIGH_FREQUENCY = HIGH_FREQUENCY
        self.ENERGY_POINTS = ENERGY_POINTS
        self.MAX_COVERAGES = MAX_COVERAGES
    def get_synthetic_spectra(self, NUM_SAMPLES, indices, IS_TRAINING_SET,
                              TRAINING_ERROR=None):
        """Obtain convoluted complex synthetic spectra.

        Parameters
        ----------
        NUM_SAMPLES : int
            Number of spectra to generate.
        indices : list
            List of indices from the primary dataset from which to generate
            spectra.
        IS_TRAINING_SET : bool
            Indicates whether the primary data should be used to compute the
            training or the validation set.
        TRAINING_ERROR : float, str, or None
            Indicates the kind of perturbations to induce in the primary
            training data. If a float, the perturbations are uniform; if
            'gaussian', the perturbations are gaussian with the same
            variance as that used to perturb the validation data.

        Returns
        -------
        Xconv : numpy.ndarray
            Convoluted complex synthetic spectra.
        yconv : numpy.ndarray
            The target variable histograms. Either binding type or GCN label.
        """
        _get_probabilities = self._get_probabilities
        _xyconv = self._xyconv
        _get_sample_data = self._get_sample_data
        NUM_TARGETS = self.NUM_TARGETS
        X = self.X.copy()
        Y = self.Y.copy()
        X_sample, Y_sample, BINDING_TYPES_sample = _get_sample_data(
            X, Y, indices, IS_TRAINING_SET, TRAINING_ERROR)
        probabilities = _get_probabilities(NUM_SAMPLES, NUM_TARGETS)
        Xconv, yconv = _xyconv(X_sample, Y_sample, probabilities,
                               BINDING_TYPES_sample)
        #set numbers that may form denormals to zero to improve numerics
        Xconv[Xconv[...] < 2**-500] = 0
        yconv[yconv[...] < 2**-500] = 0
        return (Xconv, yconv)
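# End-to-end sketch of generating a training set (all keyword values are
# illustrative assumptions):
#
#     ir_gen = IR_GEN(ADSORBATE='CO', TARGET='binding_type')
#     ir_gen.set_spectra_properties(COVERAGE='low', LOW_FREQUENCY=200,
#                                   HIGH_FREQUENCY=2200, ENERGY_POINTS=501)
#     indices = np.arange(ir_gen.Y.size)
#     Xconv, yconv = ir_gen.get_synthetic_spectra(NUM_SAMPLES=1000,
#                                                 indices=indices,
#                                                 IS_TRAINING_SET=True,
#                                                 TRAINING_ERROR=0.003)
#     # Xconv: (1000, 501) spectra; yconv: (1000, NUM_TARGETS) fractional
#     # contributions that sum to 1 in each row.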
def fold(frequencies, intensities, LOW_FREQUENCY, HIGH_FREQUENCY,
         ENERGY_POINTS, FWHM, fL):
    """Generate a spectrum from a set of frequencies and intensities.

    Parameters
    ----------
    frequencies : list or numpy.ndarray
        Set of molecular frequencies.
    intensities : list or numpy.ndarray
        Intensities that correspond to the frequencies.
    LOW_FREQUENCY : float
        The lowest frequency for which the synthetic spectrum is generated.
    HIGH_FREQUENCY : float
        The highest frequency for which the synthetic spectrum is generated.
    ENERGY_POINTS : int
        The number of points into which the synthetic spectrum is
        discretized.
    FWHM : float
        Full-width-half-maximum of the desired spectrum.
    fL : float
        Fraction of the spectrum to be convoluted by a Lorentzian
        transform. The remaining portion of the transform that makes up the
        FWHM comes from a Gaussian convoluting function.

    Returns
    -------
    spectrum : numpy.ndarray
        The spectrum generated from the set of frequencies and intensities.
    """
    energies = np.linspace(LOW_FREQUENCY, HIGH_FREQUENCY, num=ENERGY_POINTS,
                           endpoint=True)
    energy_spacing = energies[1]-energies[0]
    if FWHM < 2*energy_spacing:
        raise ValueError('Function input FWHM must be at least twice the '
                         'energy spacing to prevent information loss. It '
                         'therefore must be at least ' + str(2*energy_spacing))
    assert HIGH_FREQUENCY > LOW_FREQUENCY, \
        "The high frequency must be greater than the low frequency"
    sigma = FWHM/(2.0 * np.sqrt(2.0 * np.log(2.)))
    prefactor = 1.0/(sigma * np.sqrt(2.0 * np.pi))
    #Reshape energies to be (ENERGY_POINTS, 1) dimensional
    energies2D = energies.reshape((-1, 1))
    #Create arrays that duplicate the frequencies and intensities
    freq2D = np.tile(frequencies, (ENERGY_POINTS, 1))
    int2D = np.tile(intensities, (ENERGY_POINTS, 1))
    #contribution of each frequency at each energy to the total intensity
    int_matrix = (1-fL)*int2D*prefactor*np.exp(-(freq2D-energies2D)**2/(2.0*sigma**2)) \
                 + fL*int2D*2/(FWHM*np.pi*(4/FWHM**2*(freq2D-energies2D)**2+1))
    spectrum = np.sum(int_matrix, axis=1)
    return spectrum
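# Example use of fold (the peak positions and intensities are made up): two
# sticks at 480 and 2080 cm^-1 folded into a 501-point spectrum. The energy
# spacing is 4 cm^-1, so FWHM=40 comfortably satisfies the FWHM check.
#
#     spectrum = fold(frequencies=[480.0, 2080.0], intensities=[1.0, 2.0],
#                     LOW_FREQUENCY=200, HIGH_FREQUENCY=2200,
#                     ENERGY_POINTS=501, FWHM=40, fL=0.5)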
def HREEL_2_scaledIR(HREEL, frequency_range=None, PEAK_CONV=2.7):
    """Convert an HREEL spectrum to a scaled IR spectrum.

    Parameters
    ----------
    HREEL : numpy.ndarray
        HREEL spectrum of size (2, n) where n is the number of points into
        which the frequency and intensity are discretized.
    frequency_range : numpy.ndarray
        Frequency range onto which the HREELS will be interpolated after
        conversion to IR.
    PEAK_CONV : float
        The intensity is scaled by the wavenumber raised to PEAK_CONV.

    Returns
    -------
    IR_scaled : numpy.ndarray
        The IR spectrum the HREELS is converted to.
    """
    if frequency_range is None:
        frequency_range = np.linspace(200, 2200, num=501, endpoint=True)
    IR = np.interp(frequency_range, HREEL[0], HREEL[1]*HREEL[0]**PEAK_CONV,
                   left=None, right=None, period=None)
    IR_scaled = IR/np.max(IR)
    return IR_scaled
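# Example conversion (the HREEL array here is a random stand-in, not real
# data):
#
#     hreel = np.array([np.linspace(200, 2200, 2000),
#                       np.random.random_sample(2000)])
#     IR_scaled = HREEL_2_scaledIR(hreel)
#     # IR_scaled is interpolated onto 501 points between 200 and 2200 cm^-1
#     # and normalized so its maximum is 1.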