-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathpair_id.py
More file actions
184 lines (148 loc) · 6.58 KB
/
Copy pathpair_id.py
File metadata and controls
184 lines (148 loc) · 6.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import pandas as pd
import numpy as np
import itertools as it
class PairID(object):
    """Identify simulations in :class:`PSA` by method names and run numbers.

    This is a rough convenience class that facilitates intuitive access to the
    data generated by :class:`PSA`. PSA is based on performing all-pairs
    comparisons, so that the data generated for a comparison between a pair of
    simulations can be identified by (distance) matrix indices. Since the data
    that :class:`PSA` can generate between a pair of simulations can be very
    heterogeneous (i.e., data from Hausdorff pairs analyses), the data for all
    the pairs is stored sequentially in a one-dimensional list whose order is
    identical to the corresponding distance vector (of the distance matrix).

    Simulations are added with :meth:`PairID.add_sim`, which takes the
    name of the method and a list-like sequence of numbers corresponding to
    the runs that were performed. Once all simulations are added, the data
    generated for a given pair of simulations can be accessed by first using
    :meth:`PairID.get_pair_id` to get the comparison index and then
    using that index to extract the data in :class:`PSA` stored in distance
    vector form.

    Notes::

        1) The names and run labeling used for methods that are added to
           PairID do not need to be identical to those used for the
           corresponding analysis in PSA. However, it is useful to keep the
           naming scheme similar so that one can correctly identify
           simulations by name.
        2) Currently, there is no mechanism to remove simulations from
           :class:`PairID`, which requires modifying the simulation IDs in a
           predictable manner. This feature may be added in the future. A
           user should add simulations in a way that corresponds to how the
           simulations were added to :class:`PSA`.

    Example::

        Obtain the frames corresponding to the Hausdorff pair of the second
        DIMS simulation (i.e., DIMS 2) and third rTMD-F (i.e., rTMD-F 3)
        simulation among a total of four methods with three runs each
        (consecutively labeled '1','2','3'):

        >>> method_names = ['DIMS','FRODA','rTMD-F','rTMD-S']
        >>> identifier = PairID()
        >>> for name in method_names:
        ...     identifier.add_sim(name, [1,2,3])
        >>> ID = identifier.get_pair_id('DIMS 2', 'rTMD-F 3')
        >>> # Assume the simulations have been added to PSA as MDAnalysis
        >>> # Universes in the above order (DIMS 1, ..., DIMS 3, FRODA 1, ...,
        >>> # FRODA 3, ..., rTMD-S 2, rTMD-S 3)
        >>> psa_hpa = PSA(universes, path_select='name CA')
        >>> psa_hpa.generate_paths()
        >>> psa_hpa.run_hausdorff_pairs_analysis(hausdorff_pairs=True)
        >>> psa_hpa.HP['frames']
    """

    def __init__(self):
        """Initialize a :class:`PairID` object.

        Sets up labels for method names and run labels (IDs) and initializes a
        pandas DataFrame object.
        """
        # Names of the two MultiIndex levels used to look up a simulation.
        self.ilbl = ['Name', 'Run ID']
        # Name of the single data column holding the integer simulation ID.
        self.clbl = ['Sim ID']
        self.data = pd.DataFrame()
        self.num_sims = 0
        self.num_methods = 0

    def add_sim(self, method, run_ids):
        """Add a simulation method and its run labels to :class:`PairID`.

        Each (method, run ID) pair is assigned the next free integer
        simulation ID, in the order given by *run_ids*.

        :Arguments:
          *method*
             string, name of the simulation method
          *run_ids*
             array-like, the number labels of the runs performed for *method*
        """
        num_new_sims = len(run_ids)
        tuples = list(it.product([method], run_ids))
        df_idx = pd.MultiIndex.from_tuples(tuples, names=self.ilbl)
        # New IDs continue counting from the simulations already registered.
        sim_ids = np.arange(num_new_sims) + self.num_sims
        df_new = pd.DataFrame(sim_ids, index=df_idx, columns=self.clbl)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # replacement. The initial empty frame is swapped out directly so the
        # MultiIndex and integer dtype of the first batch are kept intact.
        if self.data.empty:
            self.data = df_new
        else:
            self.data = pd.concat([self.data, df_new])
        self.num_sims += num_new_sims
        self.num_methods += 1

    def dvectorform(self, i, j):
        """Convert simulation IDs to a Pair ID.

        Simulations added to :class:`PairID` are indexed by a unique
        integer ID. Two integer IDs correspond to a location in the matrix
        representing all comparisons between pairs of simulations in PSA. The
        comparison matrix indices are converted to an index in a corresponding
        comparison vector analogously to conversion between a distance matrix
        and a distance vector.

        :Arguments:
          *i*
             int, row index
          *j*
             int, column index

        :Returns:
          int, the Pair ID of the pair of simulations *i* and *j*

        :Raises:
          :exc:`ValueError` if *i* and *j* are equal
        """
        if i == j:
            raise ValueError("Indices cannot have the same value.")
        # The formula addresses the upper triangle, so order the pair (i < j).
        if j < i:
            i, j = j, i
        # (i+2)*(i+1) is always even, so floor division is exact and keeps
        # the result an integer (true division would produce a float).
        return (self.num_sims * i) + j - (i + 2) * (i + 1) // 2

    def get_pair_id(self, sim1, sim2, vectorform=True):
        """Get the Pair ID of a pair of simulations.

        Note: the names of simulations are assumed to take the following form:
        '<Name> <Run ID>', e.g., 'DIMS 1' or 'rTMD-S 2'.

        :Arguments:
          *sim1*
             string, name of first simulation in comparison
          *sim2*
             string, name of second simulation in comparison
          *vectorform*
             bool, if ``True`` (default) return the index into the comparison
             (distance) vector; if ``False`` return the pair of simulation
             IDs ``(i, j)`` (matrix indices) instead

        :Returns:
          int, the Pair ID of the comparison between *sim1* and *sim2*, or the
          tuple ``(i, j)`` of simulation IDs if *vectorform* is ``False``.
          If *vectorform* is ``True`` and both names refer to the same
          simulation, a message is printed and ``None`` is returned.
        """
        i, j = self.get_sim_id(sim1), self.get_sim_id(sim2)
        if not vectorform:
            return (i, j)
        try:
            return self.dvectorform(i, j)
        except ValueError:
            # Identical simulations have no entry in the comparison vector.
            print("Must enter two different simulations.")
            return None

    def get_sim_id(self, sim):
        """Obtain the simulation ID of a given simulation.

        Note: the names of simulations are assumed to take the following form:
        '<Name> <Run ID>', e.g., 'DIMS 1' or 'rTMD-S 2'.

        :Arguments:
          *sim*
             string, full simulation name with run label separated by a space

        :Returns:
          int, the simulation ID
        """
        sim_tuple = self._str2tup(sim)
        # Cast the numpy scalar from the DataFrame to a plain Python int.
        return int(self.data.loc[sim_tuple, self.clbl[0]])

    def _str2tup(self, name):
        """Return a string of the form '<name> <number>' as a tuple with the
        form ('<name>', <number>).

        :Arguments:
          *name*
             string, name and number separated by a space

        :Returns:
          (string, int), the tuple containing the name and number
        """
        method, run_id = name.split()
        return method, int(run_id)

    def get_num_sims(self):
        """
        :Returns:
          int, total number of simulations in :class:`PairID`
        """
        return self.num_sims

    def get_num_methods(self):
        """
        :Returns:
          int, total number of simulation methods in :class:`PairID`
        """
        return self.num_methods