From 380cdc85e4c2bda50c8223689af45176945cab00 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 15 Nov 2013 16:32:16 -0800 Subject: [PATCH 01/23] Create indices of water atoms --- scripts/CreateAtomIndices.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/CreateAtomIndices.py b/scripts/CreateAtomIndices.py index 98956932..3ee77d35 100755 --- a/scripts/CreateAtomIndices.py +++ b/scripts/CreateAtomIndices.py @@ -25,21 +25,25 @@ logger = logging.getLogger('msmbuilder.scripts.CreateAtomIndices') + parser = arglib.ArgumentParser( description="Creates an atom indices file for RMSD from a PDB.") parser.add_argument('pdb') parser.add_argument('output', default='AtomIndices.dat') parser.add_argument('atom_type', help='''Atoms to include in index file. + One of four options: (1) minimal (CA, CB, C, N, O, recommended), (2) heavy, - (3) alpha (carbons), or (4) all. Use "all" in cases where protein + (3) alpha (carbons), (4) water, or (5) all. Use "all" in cases where protein nomenclature may be inapproprate, although you may want to define your own indices in such situations. Note that "heavy" keeps all heavy atoms that are not symmetry equivalent. By symmetry equivalent, we mean atoms identical under an exchange of labels. For example, heavy will exclude - the two pairs of equivalent carbons (CD, CE) in a PHE ring. - Note that AtomIndices.dat should be zero-indexed--that is, a 0 + the two pairs of equivalent carbons (CD, CE) in a PHE ring. + Note that AtomIndices.dat should be zero-indexed--that is, a 0 in AtomIndices.dat corresponds to the first atom in your PDB''', - choices=['minimal', 'heavy', 'alpha', 'all'], default='minimal') + choices=['minimal', 'heavy', 'alpha', 'all', 'water'], + default='minimal') + def run(PDBfn, atomtype): @@ -122,6 +126,7 @@ def run(PDBfn, atomtype): "CNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"], "NNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"], "SOL": [], + "HOH": [], "Cl-": [], "Na+": [] } @@ -135,6 +140,12 @@ def run(PDBfn, atomtype): elif atomtype == 'alpha': for key in toKeepDict.keys(): toKeepDict[key] = ["CA"] + elif atomtype == 'water': + for key in toKeepDict.keys(): + if key == "SOL" or key == 'HOH': + toKeepDict[key] = ["O"] + else: + toKeepDict[key] = [] elif atomtype == "all": pass else: From 40c440c3a5e9973baf09dc59dcf1ecccacf19ef0 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 15 Nov 2013 16:32:32 -0800 Subject: [PATCH 02/23] Initial commit of solvent fingerprint metric --- src/python/metrics/solvent.py | 94 +++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 src/python/metrics/solvent.py diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py new file mode 100644 index 00000000..936c8c33 --- /dev/null +++ b/src/python/metrics/solvent.py @@ -0,0 +1,94 @@ +import logging +logger = logging.getLogger(__name__) +import numpy as np +from baseclasses import Vectorized + +class SolventFp(Vectorized): + """Distance metric for calculating distances between frames based on their + solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8. + """ + + def __init__(self, solute_indices, solvent_indices, sigma, + metric='euclidean', p=2, V=None, VI=None): + """Create a distance metric to capture solvent degrees of freedom + + Parameters + ---------- + solute_indices : ndarray + atom indices of the solute atoms + solvent_indices : ndarray + atom indices of the solvent atoms + sigma : float + width of gaussian kernel + metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock', + 'correlation', 'cosine', 'euclidean', 'minkowski', + 'sqeuclidean', 'seuclidean', 'mahalanobis', 'sqmahalanobis'} + Distance metric to equip the vector space with. + p : int, optional + p-norm order, used for metric='minkowski' + V : ndarray, optional + variances, used for metric='seuclidean' + VI : ndarray, optional + inverse covariance matrix, used for metric='mahalanobi' + + """ + _check_indices(solute_indices, 'Solute') + _check_indices(solvent_indices, 'Solvent') + + super(SolventFp, self).__init__(metric, p, V, VI) + self.solute_indices = solute_indices + self.solvent_indices = solvent_indices + self.sigma = sigma + + def __repr__(self): + "String representation of the object" + return ('metrics.SolventFp(metric=%s, p=%s, sigma=%s)' + % (self.metric, self.p, self.sigma)) + + def prepare_trajectory(self, trajectory): + """lalalala + Parameters + ---------- + trajectory : msmbuilder.Trajectory + An MSMBuilder trajectory to prepare + + Returns + ------- + lalal : ndarray + A 2D array of lalala + """ + + # Give shorter names to these things + prot_indices = self.solute_indices + water_indices = self.solvent_indices + sigma = self.sigma + traj = trajectory['XYZList'] + + # The result vector + fingerprints = np.zeros(len(prot_indices)) + + for i, prot_i in enumerate(prot_indices): + for water_j in water_indices: + fingerprints[i] += _kernel(traj[prot_i], traj[water_j], sigma) + + return fingerprints + +def _kernel(x, y, sigma): + """Gaussian kernel K(x, y).""" + diff = x - y + dot = np.dot(diff, diff) + return np.exp(-dot / (2.0 * sigma * sigma)) + +def _check_indices(indices, name): + """Validate input indices.""" + if indices is not None: + if not isinstance(indices, np.ndarray): + raise ValueError('%s indices must be a numpy array' % name) + if not indices.ndim == 1: + raise ValueError('%s indices must be 1D' % name) + if not indices.dtype == np.int: + raise ValueError('%s indices must contain ints' % name) + else: + raise ValueError('%s indices must be specified' % name) + + From b57b2d84466823068c1f5514b33eb73b87e7431a Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 15 Nov 2013 17:01:35 -0800 Subject: [PATCH 03/23] Include the metric in command line --- src/python/metrics/__init__.py | 1 + src/python/metrics/parsers.py | 92 +++++++++++++++++++++------------- src/python/metrics/solvent.py | 5 ++ 3 files changed, 63 insertions(+), 35 deletions(-) diff --git a/src/python/metrics/__init__.py b/src/python/metrics/__init__.py index 6d860cd0..70007637 100644 --- a/src/python/metrics/__init__.py +++ b/src/python/metrics/__init__.py @@ -19,3 +19,4 @@ from hybrid import Hybrid, HybridPNorm from projection import RedDimPNorm from positions import Positions +from solvent import SolventFp diff --git a/src/python/metrics/parsers.py b/src/python/metrics/parsers.py index 24c68ae3..4cd3856d 100644 --- a/src/python/metrics/parsers.py +++ b/src/python/metrics/parsers.py @@ -9,7 +9,7 @@ from msmbuilder.metrics import (RMSD, Dihedral, BooleanContact, AtomPairs, ContinuousContact, AbstractDistanceMetric, Vectorized, - RedDimPNorm, Positions) + RedDimPNorm, Positions, SolventFp) def add_argument(group, *args, **kwargs): if 'default' in kwargs: @@ -33,14 +33,14 @@ def add_basic_metric_parsers(metric_subparser): metric_parser_list = [] - #metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric') + # metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric') rmsd = metric_subparser.add_parser('rmsd', description='''RMSD: Root mean square deviation over a set of user defined atoms (typically backbone heavy atoms or alpha carbons). To evaluate the distance between two structures, first they are rotated and translated with respect to one another to achieve maximum coincidence. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') - add_argument(rmsd, '-a', dest='rmsd_atom_indices', help='Atom indices to use in RMSD calculation. Pass "all" to use all atoms.', + add_argument(rmsd, '-a', dest='rmsd_atom_indices', help='Atom indices to use in RMSD calculation. Pass "all" to use all atoms.', default='AtomIndices.dat') metric_parser_list.append(rmsd) @@ -52,7 +52,7 @@ def add_basic_metric_parsers(metric_subparser): frames are computed by taking distances between these vectors in R^n. The euclidean distance is recommended, but other distance metrics are available (cityblock, etc). This code is executed in parallel on multiple cores (but - not multiple boxes) using OMP. ''') + not multiple boxes) using OMP. ''') add_argument(dihedral, '-a', dest='dihedral_angles', default='phi/psi', help='which dihedrals. Choose from phi, psi, chi (to choose multiple, seperate them with a slash), or user') add_argument(dihedral, '-f', dest='dihedral_userfilename', default='DihedralIndices.dat', help='filename for dihedral indices, N lines of 4 space-separated indices (otherwise ignored)') @@ -72,22 +72,22 @@ def add_basic_metric_parsers(metric_subparser): Furthermore, the sense in which the distance between two residues is computed can be either specified as "CA", "closest", or "closest-heavy", which will respectively compute ("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of - atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), + atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to one residue and j to the other residue. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') add_argument(contact, '-c', dest='contact_which', default='all', help='Path to file containing 2D array of the contacts you want, or the string "all".') add_argument(contact, '-C', dest='contact_cutoff', default=0.5, help='Cutoff distance in nanometers. If you pass -1, then the contact "map" will be a matrix of residue-residue distances. Passing a number greater than 0 means the residue-residue distance matrix will be converted to a boolean matrix, one if the distance is less than the specified cutoff') - add_argument(contact, '-f', dest='contact_cutoff_file', help='File containing residue specific cutoff distances (supercedes the scalar cutoff distance if present).',default=None) + add_argument(contact, '-f', dest='contact_cutoff_file', help='File containing residue specific cutoff distances (supercedes the scalar cutoff distance if present).', default=None) add_argument(contact, '-s', dest='contact_scheme', default='closest-heavy', help='contact scheme.', choices=['CA', 'closest', 'closest-heavy']) metric_parser_list.append(contact) - atompairs = metric_subparser.add_parser('atompairs',description='''ATOMPAIRS: For each frame, we + atompairs = metric_subparser.add_parser('atompairs', description='''ATOMPAIRS: For each frame, we represent the conformation as a vector of particular atom-atom distances. Then the distance between frames is calculated using a specified norm on these vectors. This code is executed in - parallel (but not multiple boxes) using OMP.''') + parallel (but not multiple boxes) using OMP.''') add_argument(atompairs, '-a', dest='atompairs_which', help='path to file with 2D array of which atompairs to use.', default='AtomPairs.dat') add_argument(atompairs, '-p', dest='atompairs_p', default=2, help='p used for metric=minkowski (otherwise ignored)') @@ -95,6 +95,19 @@ def add_basic_metric_parsers(metric_subparser): help='which distance metric', choices=AtomPairs.allowable_scipy_metrics) metric_parser_list.append(atompairs) + solventfp = metric_subparser.add_parser('solventfp', description='''SOLVENTFP: Get + the solvent fingerprint.''') + add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat', + help='Indices of the solvent (water) atoms') + add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat', + help='Indices of the solute (protein) atoms') + add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5, + help='std. dev. of gaussian kernel') + add_argument(solventfp, '-p', dest='solventfp_p', default=2, help='p used for metric=minkowski (otherwise ignored)') + add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean', + help='which distance metric', choices=SolventFp.allowable_scipy_metrics) + metric_parser_list.append(solventfp) + positions = metric_subparser.add_parser('positions', description="""POSITIONS: For each frame we represent the conformation as a vector of atom positions, where the atoms have been aligned to a target structure.""") @@ -113,36 +126,36 @@ def add_basic_metric_parsers(metric_subparser): add_argument(picklemetric, '-i', dest='picklemetric_input', required=True, help="Path to pickle file for the metric") metric_parser_list.append(picklemetric) - + for add_parser in locate_metric_plugins('add_metric_parser'): plugin_metric_parser = add_parser(metric_subparser, add_argument) metric_parser_list.append(plugin_metric_parser) - + return metric_parser_list ################################################################################ def add_layer_metric_parsers(metric_subparser): - tica = metric_subparser.add_parser( 'tica', description=''' + tica = metric_subparser.add_parser('tica', description=''' tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f. in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.) - for more details. In addition to these options, you must provide an additional + for more details. In addition to these options, you must provide an additional metric you used to prepare the trajectories in the training step.''') required = tica.add_argument_group('required') choose_one = tica.add_argument_group('selecting projection vectors (choose_one)') - add_argument(tica,'-p', dest='p', help='p value for p-norm') - add_argument(tica,'-m', dest='projected_metric', help='metric to use in the projected space', + add_argument(tica, '-p', dest='p', help='p value for p-norm') + add_argument(tica, '-m', dest='projected_metric', help='metric to use in the projected space', choices=Vectorized.allowable_scipy_metrics, default='euclidean') - add_argument(required, '-f', dest='tica_fn', + add_argument(required, '-f', dest='tica_fn', help='tICA Object which was prepared by tICA_train.py') add_argument(choose_one, '-n', dest='num_vecs', type=int, help='Choose the top <-n> eigenvectors based on their eigenvalues') tica.metric_parser_list = [] - tica_subparsers = tica.add_subparsers(dest='sub_metric', description=''' - Available metrics to use in preparing the trajectory before projecting.''' ) + tica_subparsers = tica.add_subparsers(dest='sub_metric', description=''' + Available metrics to use in preparing the trajectory before projecting.''') tica.metric_parser_list = add_basic_metric_parsers(tica_subparsers) return tica.metric_parser_list @@ -151,7 +164,7 @@ def add_metric_parsers(parser, add_layer_metrics=False): metric_parser_list = [] - metric_subparser = parser.add_subparsers(dest='metric', description ='Available metrics to use.') + metric_subparser = parser.add_subparsers(dest='metric', description='Available metrics to use.') if add_layer_metrics: metric_parser_list.extend(add_layer_metric_parsers(metric_subparser)) @@ -166,33 +179,33 @@ def construct_basic_metric(metric_name, args): atom_indices = np.loadtxt(args.rmsd_atom_indices, np.int) else: atom_indices = None - metric = RMSD(atom_indices)#, omp_parallel=args.rmsd_omp_parallel) + metric = RMSD(atom_indices) # , omp_parallel=args.rmsd_omp_parallel) elif metric_name == 'dihedral': metric = Dihedral(metric=args.dihedral_metric, p=args.dihedral_p, angles=args.dihedral_angles, userfilename=args.dihedral_userfilename) - + elif metric_name == 'contact': if args.contact_which != 'all': contact_which = np.loadtxt(args.contact_which, np.int) else: contact_which = 'all' - if args.contact_cutoff_file != None: - contact_cutoff = np.loadtxt(args.contact_cutoff_file, np.float) + if args.contact_cutoff_file != None: + contact_cutoff = np.loadtxt(args.contact_cutoff_file, np.float) elif args.contact_cutoff != None: - contact_cutoff = float( args.contact_cutoff ) + contact_cutoff = float(args.contact_cutoff) else: contact_cutoff = None - + if contact_cutoff != None and contact_cutoff < 0: metric = ContinuousContact(contacts=contact_which, scheme=args.contact_scheme) else: metric = BooleanContact(contacts=contact_which, cutoff=contact_cutoff, scheme=args.contact_scheme) - + elif metric_name == 'atompairs': if args.atompairs_which != None: pairs = np.loadtxt(args.atompairs_which, np.int) @@ -203,8 +216,8 @@ def construct_basic_metric(metric_name, args): atom_pairs=pairs) elif metric_name == 'positions': - target = md.load(args.target) - + target = md.load(args.target) + if args.pos_atom_indices != None: atom_indices = np.loadtxt(args.pos_atom_indices, np.int) else: @@ -214,17 +227,26 @@ def construct_basic_metric(metric_name, args): align_indices = np.loadtxt(args.align_indices, np.int) else: align_indices = None - + metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices, metric=args.positions_metric, p=args.positions_p) - + + elif metric_name == 'solventfp': + proti = np.loadtxt(args.solventfp_proti, np.int) + wateri = np.loadtxt(args.solventfp_wateri, np.int) + metric = SolventFp(solute_indices=proti, + solvent_indices=wateri, + sigma=args.solventfp_sigma, + metric=args.solventfp_metric, + p=args.solventfp_p) + elif metric_name == 'custom': with open(args.picklemetric_input) as f: metric = pickle.load(f) - print '#'*80 + print '#' * 80 print 'Loaded custom metric:' print metric - print '#'*80 + print '#' * 80 else: # apply the constructor on args and take the first non-none element # note that using these itertools constructs, we'll only actual @@ -235,7 +257,7 @@ def construct_basic_metric(metric_name, args): except StopIteration: # This means that none of the plugins acceptedthe metric raise RuntimeError("Bad metric. Could not be constructed by any built-in or plugin metric. Perhaps you have a poorly written plugin?") - + if not isinstance(metric, AbstractDistanceMetric): return ValueError("%s is not a AbstractDistanceMetric" % metric) @@ -245,17 +267,17 @@ def construct_basic_metric(metric_name, args): def construct_layer_metric(metric_name, args): if metric_name == 'tica': sub_metric = construct_basic_metric(args.sub_metric, args) - + tica_obj = tICA.load(args.tica_fn, sub_metric) - return RedDimPNorm(tica_obj, num_vecs=args.num_vecs, + return RedDimPNorm(tica_obj, num_vecs=args.num_vecs, metric=args.projected_metric, p=args.p) else: raise Exception("do not know how to construct metric (%s)") -def construct_metric( args ): +def construct_metric(args): if hasattr(args, 'sub_metric'): return construct_layer_metric(args.metric, args) else: diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 936c8c33..8ca22329 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -8,6 +8,11 @@ class SolventFp(Vectorized): solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8. """ + allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev', + 'cityblock', 'correlation', 'cosine', + 'euclidean', 'minkowski', 'sqeuclidean', + 'seuclidean', 'mahalanobis', 'sqmahalanobis'] + def __init__(self, solute_indices, solvent_indices, sigma, metric='euclidean', p=2, V=None, VI=None): """Create a distance metric to capture solvent degrees of freedom From 30914ad8b3a8d02dbc9aea69785e62d279975392 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 15 Nov 2013 19:33:44 -0800 Subject: [PATCH 04/23] Added outer product assignment --- scripts/AssignOuter.py | 49 +++++++++++++++++++++++++++++ src/python/metrics/solvent.py | 59 ++++++++++++++++++++++++++++++++--- 2 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 scripts/AssignOuter.py diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py new file mode 100644 index 00000000..94b627a4 --- /dev/null +++ b/scripts/AssignOuter.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# This file is part of MSMBuilder. +# +# Copyright 2011 Stanford University +# +# MSMBuilder is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +from msmbuilder import arglib, io +from msmbuilder.metrics import solvent +import logging +logger = logging.getLogger('msmbuilder.scripts.AssignOuter') + +parser = arglib.ArgumentParser(description='''Assign data to the outer product + space of two preliminary + assignments''') +parser.add_argument('assignment1', default='./Data1/Assignments.h5', + help='First assignment file') +parser.add_argument('assignment2', default='./Data2/Assignments.h5', + help='Second assignment file') +parser.add_argument('assignment_out', default='OuterProductAssignments.h5', + help='Output file') + + +def main(ass1_fn, ass2_fn): + hierarchical = Hierarchical.load_from_disk(zmatrix_fn) + assignments = hierarchical.get_assignments(k=k, cutoff_distance=d) + return assignments + +if __name__ == "__main__": + args = parser.parse_args() + arglib.die_if_path_exists(args.assignment_out) + + opa = solvent.OuterProductAssignment(args.assignment1, args.assignment2) + new_assignments = opa.get_product_assignments() + io.saveh(args.assignment_out, new_assignments) + logger.info('Saved outer product assignments to %s', args.assignment_out) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 8ca22329..afc7242a 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -2,6 +2,7 @@ logger = logging.getLogger(__name__) import numpy as np from baseclasses import Vectorized +from msmbuilder import io class SolventFp(Vectorized): """Distance metric for calculating distances between frames based on their @@ -70,11 +71,12 @@ def prepare_trajectory(self, trajectory): traj = trajectory['XYZList'] # The result vector - fingerprints = np.zeros(len(prot_indices)) - - for i, prot_i in enumerate(prot_indices): - for water_j in water_indices: - fingerprints[i] += _kernel(traj[prot_i], traj[water_j], sigma) + fingerprints = np.zeros((len(traj), len(prot_indices))) + + for frame_t in xrange(len(traj)): + for i, prot_i in enumerate(prot_indices): + for water_j in water_indices: + fingerprints[frame_t, i] += _kernel(traj[frame_t][prot_i], traj[frame_t][water_j], sigma) return fingerprints @@ -96,4 +98,51 @@ def _check_indices(indices, name): else: raise ValueError('%s indices must be specified' % name) +class OuterProductAssignment(object): + """Class to facilitate taking the outer product of the result of + two clusterings.""" + + def __init__(self, ass1_fn, ass2_fn): + """Create the object + + the ass_fn's should point to an hdf5 assignments file. + """ + self.ass1 = io.loadh(ass1_fn, 'arr_0') + self.ass2 = io.loadh(ass2_fn, 'arr_0') + + def get_product_assignments(self): + assert self.ass1.shape == self.ass2.shape, """Assignments must be + for the same set of trajectories.""" + new_ass = -1 * np.ones_like(self.ass1, dtype=np.int) + + nstates1 = np.max(self.ass1) + 1 + nstates2 = np.max(self.ass2) + 1 + + translations = np.reshape(np.arange(nstates1 * nstates2), + (nstates1, nstates2)) + + ass_shape = self.ass1.shape + for i in xrange(ass_shape[0]): + for j in xrange(ass_shape[1]): + if self.ass1[i, j] == -1: + # No assignment here + assert self.ass2[i, j] == -1, """Assignments must be for + the same set of trajectories.""" + + new_ass[i, j] = translations[self.ass1[i, j], self.ass2[i, j]] + + return new_ass + + + + + + + + + + + + + From 736deca3b9ede6e82da87f8ba3dff687325f5b92 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Wed, 20 Nov 2013 15:12:12 -0800 Subject: [PATCH 05/23] Added parser type=float to fix command line sigma setting --- src/python/metrics/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/metrics/parsers.py b/src/python/metrics/parsers.py index 4cd3856d..796a66cb 100644 --- a/src/python/metrics/parsers.py +++ b/src/python/metrics/parsers.py @@ -102,7 +102,7 @@ def add_basic_metric_parsers(metric_subparser): add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat', help='Indices of the solute (protein) atoms') add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5, - help='std. dev. of gaussian kernel') + type=float, help='std. dev. of gaussian kernel') add_argument(solventfp, '-p', dest='solventfp_p', default=2, help='p used for metric=minkowski (otherwise ignored)') add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean', help='which distance metric', choices=SolventFp.allowable_scipy_metrics) From 7c5e3a19b4f8a37ed90e72a8d5b17a612dde99cb Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Thu, 21 Nov 2013 18:09:52 -0800 Subject: [PATCH 06/23] Changed to contact.atom_distance for calculation. Also this has the effect of removing triple nested for loop to one loop --- src/python/metrics/solvent.py | 43 +++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index afc7242a..90a56a94 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -1,8 +1,13 @@ import logging -logger = logging.getLogger(__name__) -import numpy as np -from baseclasses import Vectorized + from msmbuilder import io +from msmbuilder.geometry import contact + +from baseclasses import Vectorized +import numpy as np + + +logger = logging.getLogger(__name__) class SolventFp(Vectorized): """Distance metric for calculating distances between frames based on their @@ -52,7 +57,7 @@ def __repr__(self): % (self.metric, self.p, self.sigma)) def prepare_trajectory(self, trajectory): - """lalalala + """Calculate solvent fingerprints Parameters ---------- trajectory : msmbuilder.Trajectory @@ -60,8 +65,9 @@ def prepare_trajectory(self, trajectory): Returns ------- - lalal : ndarray - A 2D array of lalala + fingerprints : ndarray + A 2D array of fingerprint vectors of + shape (traj_length, protein_atom) """ # Give shorter names to these things @@ -72,19 +78,22 @@ def prepare_trajectory(self, trajectory): # The result vector fingerprints = np.zeros((len(traj), len(prot_indices))) - - for frame_t in xrange(len(traj)): - for i, prot_i in enumerate(prot_indices): - for water_j in water_indices: - fingerprints[frame_t, i] += _kernel(traj[frame_t][prot_i], traj[frame_t][water_j], sigma) - return fingerprints + for i, prot_i in enumerate(prot_indices): + # For each protein atom, calculate distance to all water + # molecules + atom_contacts = np.empty((len(water_indices), 2)) + atom_contacts[:, 0] = prot_i + atom_contacts[:, 1] = water_indices + # Get a traj_length x n_water_indices vector of distances + distances = contact.atom_distances(traj, atom_contacts) + # Calculate guassian kernel + distances = np.exp(-distances / (2 * sigma * sigma)) + + # Sum over water atoms for all frames + fingerprints[:, i] = np.sum(distances, axis=1) -def _kernel(x, y, sigma): - """Gaussian kernel K(x, y).""" - diff = x - y - dot = np.dot(diff, diff) - return np.exp(-dot / (2.0 * sigma * sigma)) + return fingerprints def _check_indices(indices, name): """Validate input indices.""" From 71447930aedbad6bd12c60d8a8dde7aa9cf23ee3 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Thu, 12 Dec 2013 15:34:40 -0800 Subject: [PATCH 07/23] Fixed error with unequal length trajectories It wasn't giving them -1 --- src/python/metrics/solvent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 90a56a94..68ae6b56 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -137,8 +137,8 @@ def get_product_assignments(self): # No assignment here assert self.ass2[i, j] == -1, """Assignments must be for the same set of trajectories.""" - - new_ass[i, j] = translations[self.ass1[i, j], self.ass2[i, j]] + else: + new_ass[i, j] = translations[self.ass1[i, j], self.ass2[i, j]] return new_ass From ed6600a54e7570367578bd2588f0045229ab3a88 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Thu, 9 Jan 2014 14:01:55 -0800 Subject: [PATCH 08/23] Use mdtraj distance calculations --- src/python/metrics/solvent.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 68ae6b56..75b86a4c 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -1,7 +1,7 @@ import logging from msmbuilder import io -from msmbuilder.geometry import contact +import mdtraj as md from baseclasses import Vectorized import numpy as np @@ -82,11 +82,11 @@ def prepare_trajectory(self, trajectory): for i, prot_i in enumerate(prot_indices): # For each protein atom, calculate distance to all water # molecules - atom_contacts = np.empty((len(water_indices), 2)) - atom_contacts[:, 0] = prot_i - atom_contacts[:, 1] = water_indices + atom_pairs = np.empty((len(water_indices), 2)) + atom_pairs[:, 0] = prot_i + atom_pairs[:, 1] = water_indices # Get a traj_length x n_water_indices vector of distances - distances = contact.atom_distances(traj, atom_contacts) + distances = md.compute_distances(traj, atom_pairs, periodic=True) # Calculate guassian kernel distances = np.exp(-distances / (2 * sigma * sigma)) From d0c5aa7186462fb9282711bd86719ac6f9b70c83 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Thu, 9 Jan 2014 17:29:39 -0800 Subject: [PATCH 09/23] Fix solvent fingerprint metric to use mdtraj trajectories --- src/python/metrics/solvent.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 75b86a4c..529790d6 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -74,10 +74,9 @@ def prepare_trajectory(self, trajectory): prot_indices = self.solute_indices water_indices = self.solvent_indices sigma = self.sigma - traj = trajectory['XYZList'] # The result vector - fingerprints = np.zeros((len(traj), len(prot_indices))) + fingerprints = np.zeros((trajectory.n_frames, len(prot_indices))) for i, prot_i in enumerate(prot_indices): # For each protein atom, calculate distance to all water @@ -86,7 +85,7 @@ def prepare_trajectory(self, trajectory): atom_pairs[:, 0] = prot_i atom_pairs[:, 1] = water_indices # Get a traj_length x n_water_indices vector of distances - distances = md.compute_distances(traj, atom_pairs, periodic=True) + distances = md.compute_distances(trajectory, atom_pairs, periodic=True) # Calculate guassian kernel distances = np.exp(-distances / (2 * sigma * sigma)) From fd806371d6393f2d4b1db5e3a7427ba8a4f8de05 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Thu, 9 Jan 2014 18:11:16 -0800 Subject: [PATCH 10/23] formatting --- src/python/metrics/solvent.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 529790d6..2dac552f 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -94,6 +94,7 @@ def prepare_trajectory(self, trajectory): return fingerprints + def _check_indices(indices, name): """Validate input indices.""" if indices is not None: @@ -106,6 +107,7 @@ def _check_indices(indices, name): else: raise ValueError('%s indices must be specified' % name) + class OuterProductAssignment(object): """Class to facilitate taking the outer product of the result of two clusterings.""" From 0de3f02ef9234576a6f4e8eaa4a2b70c7982d050 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Mon, 13 Jan 2014 15:16:34 -0800 Subject: [PATCH 11/23] Bugfix --- src/python/metrics/solvent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 2dac552f..5608271c 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -1,7 +1,7 @@ import logging -from msmbuilder import io import mdtraj as md +from mdtraj import io from baseclasses import Vectorized import numpy as np From 398b3e1860219fb543ed3b54c532a23eb8b5720f Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Mon, 13 Jan 2014 15:20:05 -0800 Subject: [PATCH 12/23] Formatting --- src/python/metrics/solvent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 5608271c..706f64f3 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) + class SolventFp(Vectorized): """Distance metric for calculating distances between frames based on their solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8. @@ -85,7 +86,9 @@ def prepare_trajectory(self, trajectory): atom_pairs[:, 0] = prot_i atom_pairs[:, 1] = water_indices # Get a traj_length x n_water_indices vector of distances - distances = md.compute_distances(trajectory, atom_pairs, periodic=True) + distances = md.compute_distances(trajectory, + atom_pairs, + periodic=True) # Calculate guassian kernel distances = np.exp(-distances / (2 * sigma * sigma)) From a3f866ef9d62f50813f0e057d97126bde2a99179 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Mon, 27 Jan 2014 17:53:39 -0800 Subject: [PATCH 13/23] use mdtraj.io.loadh instead of msmbuilder --- scripts/AssignOuter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py index 94b627a4..7e0d4997 100644 --- a/scripts/AssignOuter.py +++ b/scripts/AssignOuter.py @@ -18,7 +18,8 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -from msmbuilder import arglib, io +from msmbuilder import arglib +from mdtraj import io from msmbuilder.metrics import solvent import logging logger = logging.getLogger('msmbuilder.scripts.AssignOuter') From f5012394547505733d40bf5ec269e3e42c70eba6 Mon Sep 17 00:00:00 2001 From: Lili Peng Date: Fri, 14 Feb 2014 15:04:12 -0800 Subject: [PATCH 14/23] Merge the SaveStructures.py script from master --- scripts/SaveStructures.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/SaveStructures.py b/scripts/SaveStructures.py index 898e56e7..898a3f98 100644 --- a/scripts/SaveStructures.py +++ b/scripts/SaveStructures.py @@ -26,10 +26,10 @@ default=[-1]) # nargs=+ means the values will come out as type list, but this isn't # naturally applied to the default, so we just put the default as [-1] -parser.add_argument('format', choices=['pdb', 'xtc', 'lh5'], +parser.add_argument('format', choices=['pdb', 'xtc', 'h5'], help='''Format for the outputted conformations. PDB is the standard plaintext protein databank format. XTC is the gromacs binary trajectory - format, and lh5 is the MSMBuilder standard hdf5 based format''', + format, and h5 is the MDTraj hdf format''', default='pdb') parser.add_argument('style', choices=['sep', 'tps', 'one'], help='''Controls the number of conformations save per file. If "sep" (SEPARATE), all of @@ -99,9 +99,8 @@ def main(): # states if -1 in args.states: - n_states = len(np.unique(assignments[np.where(assignments != -1)])) - logger.info('Yanking from all %d states', n_states) - states = np.arange(n_states) + states = np.unique(assigments[np.where(assignments != -1)]) + logger.info('Yanking from all %d states', len(states)) else: # ensure that the states are sorted, and that they're unique -- you # can only request each state once @@ -109,9 +108,9 @@ def main(): logger.info("Yanking from the following states: %s", states) # extract the conformations using np.random for the randomness - confs_by_state = project.get_random_confs_from_states(assignments, - states=states, num_confs=args.conformations_per_state, - replacement=args.replacement) + confs_by_state = project.get_random_confs_from_states( + assignments, states=states, num_confs=args.conformations_per_state, + replacement=args.replacement) # save the conformations to disk, in the requested style save(confs_by_state=confs_by_state, states=states, style=args.style, From 2d6278a719f0e897c361af0bd0a7e235d621121a Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 17:51:17 -0800 Subject: [PATCH 15/23] Typo --- scripts/SaveStructures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/SaveStructures.py b/scripts/SaveStructures.py index 898a3f98..c459b5d9 100644 --- a/scripts/SaveStructures.py +++ b/scripts/SaveStructures.py @@ -99,7 +99,7 @@ def main(): # states if -1 in args.states: - states = np.unique(assigments[np.where(assignments != -1)]) + states = np.unique(assignments[np.where(assignments != -1)]) logger.info('Yanking from all %d states', len(states)) else: # ensure that the states are sorted, and that they're unique -- you From c539a48eaee58d3d0230ca255bc26342519da685 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 18:25:20 -0800 Subject: [PATCH 16/23] Get rid of extraneous #####'s from merge --- src/python/metrics/parsers.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/python/metrics/parsers.py b/src/python/metrics/parsers.py index 32cb77d4..c4e865ce 100644 --- a/src/python/metrics/parsers.py +++ b/src/python/metrics/parsers.py @@ -20,7 +20,6 @@ def add_argument(group, *args, **kwargs): kwargs['help'] = d group.add_argument(*args, **kwargs) -################################################################################ def locate_metric_plugins(name): if not name in ['add_metric_parser', 'construct_metric', 'metric_class']: @@ -32,7 +31,7 @@ def locate_metric_plugins(name): def add_metric_parsers(parser): metric_parser_list = [] - + metric_subparser = parser.add_subparsers(dest='metric', description ='Available metrics to use.') #metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric') @@ -42,7 +41,7 @@ def add_metric_parsers(parser): between two structures, first they are rotated and translated with respect to one another to achieve maximum coincidence. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') - add_argument(rmsd, '-a', dest='rmsd_atom_indices', help='Atom indices to use in RMSD calculation. Pass "all" to use all atoms.', + add_argument(rmsd, '-a', dest='rmsd_atom_indices', help='Atom indices to use in RMSD calculation. Pass "all" to use all atoms.', default='AtomIndices.dat') metric_parser_list.append(rmsd) @@ -54,7 +53,7 @@ def add_metric_parsers(parser): frames are computed by taking distances between these vectors in R^n. The euclidean distance is recommended, but other distance metrics are available (cityblock, etc). This code is executed in parallel on multiple cores (but - not multiple boxes) using OMP. ''') + not multiple boxes) using OMP. ''') add_argument(dihedral, '-a', dest='dihedral_angles', default='phi/psi', help='which dihedrals. Choose from phi, psi, chi (to choose multiple, seperate them with a slash), or user') add_argument(dihedral, '-f', dest='dihedral_userfilename', default='DihedralIndices.dat', help='filename for dihedral indices, N lines of 4 space-separated indices (otherwise ignored)') @@ -74,7 +73,7 @@ def add_metric_parsers(parser): Furthermore, the sense in which the distance between two residues is computed can be either specified as "CA", "closest", or "closest-heavy", which will respectively compute ("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of - atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), + atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to one residue and j to the other residue. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') @@ -89,7 +88,7 @@ def add_metric_parsers(parser): atompairs = metric_subparser.add_parser('atompairs',description='''ATOMPAIRS: For each frame, we represent the conformation as a vector of particular atom-atom distances. Then the distance between frames is calculated using a specified norm on these vectors. This code is executed in - parallel (but not multiple boxes) using OMP.''') + parallel (but not multiple boxes) using OMP.''') add_argument(atompairs, '-a', dest='atompairs_which', help='path to file with 2D array of which atompairs to use.', default='AtomPairs.dat') add_argument(atompairs, '-p', dest='atompairs_p', default=2, help='p used for metric=minkowski (otherwise ignored)') @@ -124,13 +123,13 @@ def add_metric_parsers(parser): tica = metric_subparser.add_parser( 'tica', description=''' tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f. in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.) - for more details. In addition to these options, you must provide an additional + for more details. In addition to these options, you must provide an additional metric you used to prepare the trajectories in the training step.''') add_argument(tica,'-p', dest='p', help='p value for p-norm') add_argument(tica,'-m', dest='projected_metric', help='metric to use in the projected space', choices=Vectorized.allowable_scipy_metrics, default='euclidean') - add_argument(tica, '-f', dest='tica_fn', + add_argument(tica, '-f', dest='tica_fn', help='tICA Object which was prepared by tICA_train.py') add_argument(tica, '-n', dest='num_vecs', type=int, help='Choose the top <-n> eigenvectors based on their eigenvalues') @@ -150,8 +149,6 @@ def add_metric_parsers(parser): return metric_parser_list - ################################################################################ - def construct_metric(args): metric_name = args.metric @@ -167,27 +164,27 @@ def construct_metric(args): metric = Dihedral(metric=args.dihedral_metric, p=args.dihedral_p, angles=args.dihedral_angles, userfilename=args.dihedral_userfilename) - + elif metric_name == 'contact': if args.contact_which != 'all': contact_which = np.loadtxt(args.contact_which, np.int) else: contact_which = 'all' - if args.contact_cutoff_file != None: - contact_cutoff = np.loadtxt(args.contact_cutoff_file, np.float) + if args.contact_cutoff_file != None: + contact_cutoff = np.loadtxt(args.contact_cutoff_file, np.float) elif args.contact_cutoff != None: contact_cutoff = float( args.contact_cutoff ) else: contact_cutoff = None - + if contact_cutoff != None and contact_cutoff < 0: metric = ContinuousContact(contacts=contact_which, scheme=args.contact_scheme) else: metric = BooleanContact(contacts=contact_which, cutoff=contact_cutoff, scheme=args.contact_scheme) - + elif metric_name == 'atompairs': if args.atompairs_which != None: pairs = np.loadtxt(args.atompairs_which, np.int) @@ -199,7 +196,7 @@ def construct_metric(args): elif metric_name == 'positions': target = md.load(args.target) - + if args.pos_atom_indices != None: atom_indices = np.loadtxt(args.pos_atom_indices, np.int) else: @@ -209,7 +206,7 @@ def construct_metric(args): align_indices = np.loadtxt(args.align_indices, np.int) else: align_indices = None - + metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices, metric=args.positions_metric, p=args.positions_p) @@ -225,7 +222,7 @@ def construct_metric(args): elif metric_name == "tica": tica_obj = tICA.load(args.tica_fn) - metric = RedDimPNorm(tica_obj, num_vecs=args.num_vecs, + metric = RedDimPNorm(tica_obj, num_vecs=args.num_vecs, metric=args.projected_metric, p=args.p) elif metric_name == 'custom': with open(args.picklemetric_input) as f: @@ -244,7 +241,7 @@ def construct_metric(args): except StopIteration: # This means that none of the plugins acceptedthe metric raise RuntimeError("Bad metric. Could not be constructed by any built-in or plugin metric. Perhaps you have a poorly written plugin?") - + if not isinstance(metric, AbstractDistanceMetric): return ValueError("%s is not a AbstractDistanceMetric" % metric) From d6d0207168905229b109f81b39f513280a40b350 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:06:32 -0800 Subject: [PATCH 17/23] More descriptive, removed main() function, changed date --- scripts/AssignOuter.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py index 7e0d4997..7f78a61c 100644 --- a/scripts/AssignOuter.py +++ b/scripts/AssignOuter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # This file is part of MSMBuilder. # -# Copyright 2011 Stanford University +# Copyright 2014 Stanford University # # MSMBuilder is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -24,9 +24,13 @@ import logging logger = logging.getLogger('msmbuilder.scripts.AssignOuter') -parser = arglib.ArgumentParser(description='''Assign data to the outer product - space of two preliminary - assignments''') +parser = arglib.ArgumentParser(description= + """Combine the results of two clusterings. Specifically, if a conformation + is in state i after clustering with metric 1, and it is in state j + after clustering with metric 2, we assign it to a new state (ij) + + This is a naive way of combining two metrics to give a singular + assignment of states.""") parser.add_argument('assignment1', default='./Data1/Assignments.h5', help='First assignment file') parser.add_argument('assignment2', default='./Data2/Assignments.h5', @@ -35,11 +39,6 @@ help='Output file') -def main(ass1_fn, ass2_fn): - hierarchical = Hierarchical.load_from_disk(zmatrix_fn) - assignments = hierarchical.get_assignments(k=k, cutoff_distance=d) - return assignments - if __name__ == "__main__": args = parser.parse_args() arglib.die_if_path_exists(args.assignment_out) From 4421d71302a2c7f9919c09e25efb286bdb3cf3ee Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:24:54 -0800 Subject: [PATCH 18/23] Address issues --- scripts/AssignOuter.py | 15 +++++++++---- src/python/metrics/parsers.py | 5 +++-- src/python/metrics/solvent.py | 40 +++++++++++++++++------------------ 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py index 7f78a61c..193b4833 100644 --- a/scripts/AssignOuter.py +++ b/scripts/AssignOuter.py @@ -39,11 +39,18 @@ help='Output file') +def main(assign1_fn, assign2_fn, out_fn): + assign1 = io.loadh(assign1_fn, 'arr_0') + assign2 = io.loadh(assign2_fn, 'arr_0') + + opa = solvent.OuterProductAssignment(assign1, assign2) + new_assignments = opa.get_product_assignments() + io.saveh(out_fn, new_assignments) + logger.info('Saved outer product assignments to %s', out_fn) + + if __name__ == "__main__": args = parser.parse_args() arglib.die_if_path_exists(args.assignment_out) - opa = solvent.OuterProductAssignment(args.assignment1, args.assignment2) - new_assignments = opa.get_product_assignments() - io.saveh(args.assignment_out, new_assignments) - logger.info('Saved outer product assignments to %s', args.assignment_out) + main(args.assignment1, args.assignment2, args.assignment_out) diff --git a/src/python/metrics/parsers.py b/src/python/metrics/parsers.py index c4e865ce..66856ea0 100644 --- a/src/python/metrics/parsers.py +++ b/src/python/metrics/parsers.py @@ -96,8 +96,9 @@ def add_metric_parsers(parser): help='which distance metric', choices=AtomPairs.allowable_scipy_metrics) metric_parser_list.append(atompairs) - solventfp = metric_subparser.add_parser('solventfp', description='''SOLVENTFP: Get - the solvent fingerprint.''') + solventfp = metric_subparser.add_parser('solventfp', description="""SOLVENTFP: For each frame + calculate a 'solvent fingerprint' by considering the weighted pairwise distances + between solute and solvent atoms as in as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.""") add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat', help='Indices of the solvent (water) atoms') add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat', diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 706f64f3..98c8b04b 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -27,9 +27,9 @@ def __init__(self, solute_indices, solvent_indices, sigma, Parameters ---------- solute_indices : ndarray - atom indices of the solute atoms + atom indices of the solute atoms solvent_indices : ndarray - atom indices of the solvent atoms + atom indices of the solvent atoms sigma : float width of gaussian kernel metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock', @@ -44,8 +44,12 @@ def __init__(self, solute_indices, solvent_indices, sigma, inverse covariance matrix, used for metric='mahalanobi' """ - _check_indices(solute_indices, 'Solute') - _check_indices(solvent_indices, 'Solvent') + + # Check input indices + md.utils.ensure_type(solute_indices, dtype=np.int, ndim=1, + name='solute', can_be_none=False) + md.utils.ensure_type(solvent_indices, dtype=np.int, ndim=1, + name='solvent', can_be_none=False) super(SolventFp, self).__init__(metric, p, V, VI) self.solute_indices = solute_indices @@ -79,6 +83,10 @@ def prepare_trajectory(self, trajectory): # The result vector fingerprints = np.zeros((trajectory.n_frames, len(prot_indices))) + # Check for periodic information + if trajectory.unitcell_lenghts is None: + logging.warn('No periodic information found for computing solventfp.') + for i, prot_i in enumerate(prot_indices): # For each protein atom, calculate distance to all water # molecules @@ -98,27 +106,19 @@ def prepare_trajectory(self, trajectory): return fingerprints -def _check_indices(indices, name): - """Validate input indices.""" - if indices is not None: - if not isinstance(indices, np.ndarray): - raise ValueError('%s indices must be a numpy array' % name) - if not indices.ndim == 1: - raise ValueError('%s indices must be 1D' % name) - if not indices.dtype == np.int: - raise ValueError('%s indices must contain ints' % name) - else: - raise ValueError('%s indices must be specified' % name) - - class OuterProductAssignment(object): """Class to facilitate taking the outer product of the result of two clusterings.""" - def __init__(self, ass1_fn, ass2_fn): - """Create the object + def __init__(self, assign1, assign2): + """Create a distance metric to capture solvent degrees of freedom - the ass_fn's should point to an hdf5 assignments file. + Parameters + ---------- + assign1 : ndarray + assignment array 1 + assign2 : ndarray + assignment array 2 """ self.ass1 = io.loadh(ass1_fn, 'arr_0') self.ass2 = io.loadh(ass2_fn, 'arr_0') From ce9956e3f50ccb5009d988a40bed723e33d66e5c Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:29:03 -0800 Subject: [PATCH 19/23] small typo --- src/python/metrics/solvent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 98c8b04b..89ffead0 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -84,7 +84,7 @@ def prepare_trajectory(self, trajectory): fingerprints = np.zeros((trajectory.n_frames, len(prot_indices))) # Check for periodic information - if trajectory.unitcell_lenghts is None: + if trajectory.unitcell_lengths is None: logging.warn('No periodic information found for computing solventfp.') for i, prot_i in enumerate(prot_indices): From d2bb38ef59e4db8249936b45f6f72d3c09c292fd Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:32:15 -0800 Subject: [PATCH 20/23] small typo --- src/python/metrics/solvent.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index 89ffead0..c35a3bae 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -1,7 +1,6 @@ import logging import mdtraj as md -from mdtraj import io from baseclasses import Vectorized import numpy as np @@ -120,8 +119,8 @@ def __init__(self, assign1, assign2): assign2 : ndarray assignment array 2 """ - self.ass1 = io.loadh(ass1_fn, 'arr_0') - self.ass2 = io.loadh(ass2_fn, 'arr_0') + self.ass1 = assign1 + self.ass2 = assign2 def get_product_assignments(self): assert self.ass1.shape == self.ass2.shape, """Assignments must be From 150b656a73e77af90c6f9a93294f8bdfabfced3b Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:34:20 -0800 Subject: [PATCH 21/23] Be clear that 'water' just means water oxygens --- scripts/CreateAtomIndices.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/CreateAtomIndices.py b/scripts/CreateAtomIndices.py index 3ee77d35..7f51feef 100755 --- a/scripts/CreateAtomIndices.py +++ b/scripts/CreateAtomIndices.py @@ -33,9 +33,10 @@ parser.add_argument('atom_type', help='''Atoms to include in index file. One of four options: (1) minimal (CA, CB, C, N, O, recommended), (2) heavy, - (3) alpha (carbons), (4) water, or (5) all. Use "all" in cases where protein - nomenclature may be inapproprate, although you may want to define your own - indices in such situations. Note that "heavy" keeps all heavy atoms that + (3) alpha (carbons), (4) water (water oxygens), or (5) all. Use "all" + in cases where protein nomenclature may be inapproprate, although you + may want to define your own indices in such situations. + Note that "heavy" keeps all heavy atoms that are not symmetry equivalent. By symmetry equivalent, we mean atoms identical under an exchange of labels. For example, heavy will exclude the two pairs of equivalent carbons (CD, CE) in a PHE ring. @@ -141,6 +142,7 @@ def run(PDBfn, atomtype): for key in toKeepDict.keys(): toKeepDict[key] = ["CA"] elif atomtype == 'water': + # Note: we only keep water oxygens for key in toKeepDict.keys(): if key == "SOL" or key == 'HOH': toKeepDict[key] = ["O"] From 7d11a1fa39b2c53eb4806d80c6a430d24f54af38 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:47:19 -0800 Subject: [PATCH 22/23] Moved outer product assignment code to a function in assigning --- scripts/AssignOuter.py | 5 ++-- src/python/assigning.py | 50 +++++++++++++++++++++++++++++++ src/python/metrics/solvent.py | 55 ----------------------------------- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py index 193b4833..657c7e92 100644 --- a/scripts/AssignOuter.py +++ b/scripts/AssignOuter.py @@ -18,7 +18,7 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -from msmbuilder import arglib +from msmbuilder import arglib, assigning from mdtraj import io from msmbuilder.metrics import solvent import logging @@ -43,8 +43,7 @@ def main(assign1_fn, assign2_fn, out_fn): assign1 = io.loadh(assign1_fn, 'arr_0') assign2 = io.loadh(assign2_fn, 'arr_0') - opa = solvent.OuterProductAssignment(assign1, assign2) - new_assignments = opa.get_product_assignments() + new_assignments = assigning.outer_product_assignment(assign1, assign2) io.saveh(out_fn, new_assignments) logger.info('Saved outer product assignments to %s', out_fn) diff --git a/src/python/assigning.py b/src/python/assigning.py index fc458160..68ef807f 100644 --- a/src/python/assigning.py +++ b/src/python/assigning.py @@ -3,6 +3,7 @@ import numpy as np import tables import warnings +from msmbuilder import MSMLib as msmlib from mdtraj import io import logging logger = logging.getLogger(__name__) @@ -236,3 +237,52 @@ def streaming_assign_with_checkpoint(metric, project, generators, assignments_pa "-- this function is deprecated"), DeprecationWarning) assign_with_checkpoint(metric, project, generators, assignments_path, distances_path, chunk_size, atom_indices_to_load) + + +def outer_product_assignment(assign1, assign2): + """Combine the results of two clusterings. + + Specifically, if a conformation is in state i after clustering + with metric 1, and it is in state j after clustering with metric 2, + we assign it to a new state (ij) + + This is a naive way of combining two metrics to give a singular + assignment of states. + + Parameters + ---------- + assign1 : np.ndarray + Assignment array 1 + assign2 : np.ndarray + Assignment array 2 + + Returns + ---------- + new_assign : np.ndarray + New assignments array, renumbered from 0 ... N-1, N <= n1 * n2 + """ + + assert assign1.shape == assign2.shape, """Assignments must be + for the same set of trajectories.""" + new_ass = -1 * np.ones_like(assign1, dtype=np.int) + + nstates1 = np.max(assign1) + 1 + nstates2 = np.max(assign2) + 1 + + translations = np.reshape(np.arange(nstates1 * nstates2), + (nstates1, nstates2)) + + ass_shape = assign1.shape + for i in xrange(ass_shape[0]): + for j in xrange(ass_shape[1]): + #TODO: vectorize this loop + if assign1[i, j] == -1: + # No assignment here + assert assign2[i, j] == -1, """Assignments must be for + the same set of trajectories.""" + else: + new_ass[i, j] = translations[assign1[i, j], assign2[i, j]] + + new_assign = msmlib.renumber_states(new_ass) + + return new_assign diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py index c35a3bae..19f61d53 100644 --- a/src/python/metrics/solvent.py +++ b/src/python/metrics/solvent.py @@ -103,58 +103,3 @@ def prepare_trajectory(self, trajectory): fingerprints[:, i] = np.sum(distances, axis=1) return fingerprints - - -class OuterProductAssignment(object): - """Class to facilitate taking the outer product of the result of - two clusterings.""" - - def __init__(self, assign1, assign2): - """Create a distance metric to capture solvent degrees of freedom - - Parameters - ---------- - assign1 : ndarray - assignment array 1 - assign2 : ndarray - assignment array 2 - """ - self.ass1 = assign1 - self.ass2 = assign2 - - def get_product_assignments(self): - assert self.ass1.shape == self.ass2.shape, """Assignments must be - for the same set of trajectories.""" - new_ass = -1 * np.ones_like(self.ass1, dtype=np.int) - - nstates1 = np.max(self.ass1) + 1 - nstates2 = np.max(self.ass2) + 1 - - translations = np.reshape(np.arange(nstates1 * nstates2), - (nstates1, nstates2)) - - ass_shape = self.ass1.shape - for i in xrange(ass_shape[0]): - for j in xrange(ass_shape[1]): - if self.ass1[i, j] == -1: - # No assignment here - assert self.ass2[i, j] == -1, """Assignments must be for - the same set of trajectories.""" - else: - new_ass[i, j] = translations[self.ass1[i, j], self.ass2[i, j]] - - return new_ass - - - - - - - - - - - - - - From 420531061bd319d9985f0fed2e83a06679639b53 Mon Sep 17 00:00:00 2001 From: Matthew Harrigan Date: Fri, 14 Feb 2014 19:54:37 -0800 Subject: [PATCH 23/23] Renumbering --- src/python/assigning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/assigning.py b/src/python/assigning.py index 68ef807f..247b6b6b 100644 --- a/src/python/assigning.py +++ b/src/python/assigning.py @@ -264,7 +264,7 @@ def outer_product_assignment(assign1, assign2): assert assign1.shape == assign2.shape, """Assignments must be for the same set of trajectories.""" - new_ass = -1 * np.ones_like(assign1, dtype=np.int) + new_assign = -1 * np.ones_like(assign1, dtype=np.int) nstates1 = np.max(assign1) + 1 nstates2 = np.max(assign2) + 1 @@ -281,8 +281,8 @@ def outer_product_assignment(assign1, assign2): assert assign2[i, j] == -1, """Assignments must be for the same set of trajectories.""" else: - new_ass[i, j] = translations[assign1[i, j], assign2[i, j]] - - new_assign = msmlib.renumber_states(new_ass) + new_assign[i, j] = translations[assign1[i, j], assign2[i, j]] + # Renumber states in place + msmlib.renumber_states(new_assign) return new_assign