Skip to content

Commit 18488dd

Browse files
authored
Add JUMP analysis figure (#55)
* add first two panels of JUMP figure
* add sup fig 9
* add updated figures
* remove phenotype profile umap
* add/remove jump processed files
* finalize figure 5
* response to review
1 parent 6864205 commit 18488dd

11 files changed

Lines changed: 1432 additions & 964 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.

3.evaluate_model/process_jump_phenotype_profiles.ipynb

Lines changed: 18 additions & 808 deletions
Large diffs are not rendered by default.

3.evaluate_model/scripts/nbconverted/process_jump_phenotype_profiles.py

Lines changed: 9 additions & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,10 @@
1313
#
1414
# 1. Load in this data from the JUMP-single-cell repo
1515
# 2. Summarize replicate KS test metrics (mean value) and align across cell types and time variables
16-
# 3. Explore the top results per phenotype/treatment_type/model_type
16+
# 3. Explore the top results per phenotype/treatment_type/model_type (Supplementary Table S1)
1717
# 4. Convert it to wide format
1818
#
1919
# This wide format represents a "phenotypic profile" which we can use similarly to an image-based morphology profile.
20-
#
21-
# We also fit a UMAP to this phenotypic profile for downstream visualization.
2220

2321
# In[1]:
2422

@@ -33,46 +31,6 @@
3331
# In[2]:
3432

3533

36-
def umap_phenotype(
37-
phenotype_df: pd.DataFrame,
38-
feature_columns: List[str],
39-
metadata_columns: List[str],
40-
n_components: int,
41-
random_seed: int,
42-
model_type: str
43-
) -> pd.DataFrame:
44-
"""
45-
Fit a UMAP (Uniform Manifold Approximation and Projection) model on the provided phenotype profile and return a transformed DataFrame with metadata.
46-
47-
Parameters:
48-
- phenotype_df (pd.DataFrame): DataFrame containing the phenotype profile with both feature and metadata columns.
49-
- feature_columns (List[str]): List of column names in phenotype_df that represent the features to be used for UMAP embedding.
50-
- metadata_columns (List[str]): List of column names in phenotype_df that represent metadata to be retained in the output.
51-
- n_components (int): Number of dimensions for the UMAP embedding.
52-
- random_seed (int): Random seed for reproducibility of the UMAP model.
53-
- model_type (str): Identifier for the model type, to be added as a column in the output DataFrame.
54-
55-
Returns:
56-
- umap_embeddings_with_metadata_df (pd.DataFrame): DataFrame with UMAP embeddings and specified metadata columns, including an additional 'model_type' column.
57-
"""
58-
59-
# Initialize UMAP
60-
umap_fit = umap.UMAP(random_state=random_seed, n_components=n_components)
61-
62-
# Fit UMAP and convert to pandas DataFrame
63-
embeddings = pd.DataFrame(
64-
umap_fit.fit_transform(phenotype_df.loc[:, feature_columns]),
65-
columns=[f"UMAP{x}" for x in range(0, n_components)],
66-
)
67-
68-
# Combine with metadata
69-
umap_embeddings_with_metadata_df = pd.concat([phenotype_df.loc[:, metadata_columns], embeddings], axis=1).assign(model_type=model_type)
70-
return umap_embeddings_with_metadata_df
71-
72-
73-
# In[3]:
74-
75-
7634
# Set file paths
7735
# JUMP phenotype probabilities from AreaShape model
7836
commit = "4225e427fd9da59159de69f53be65c31b4d4644a"
@@ -86,7 +44,7 @@ def umap_phenotype(
8644
n_top_results_to_explore = 10
8745

8846

89-
# In[4]:
47+
# In[3]:
9048

9149

9250
# Set output files
@@ -97,12 +55,10 @@ def umap_phenotype(
9755
final_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles.tsv.gz")
9856
shuffled_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles_shuffled.tsv.gz")
9957

100-
jump_umap_file = pathlib.Path(output_dir, "jump_phenotype_profiling_umap.tsv.gz")
101-
10258

10359
# ## Load and process data
10460

105-
# In[5]:
61+
# In[4]:
10662

10763

10864
# Load KS test results and drop uninformative columns
@@ -115,7 +71,7 @@ def umap_phenotype(
11571
jump_pred_df.head()
11672

11773

118-
# In[6]:
74+
# In[5]:
11975

12076

12177
# Process data to match treatments and scores across cell types
@@ -163,7 +119,7 @@ def umap_phenotype(
163119
jump_pred_compare_df.head()
164120

165121

166-
# In[7]:
122+
# In[6]:
167123

168124

169125
# Focus on the top results for downstream interpretation
@@ -182,133 +138,30 @@ def umap_phenotype(
182138

183139
# ## Summarize data
184140

185-
# In[8]:
141+
# In[7]:
186142

187143

188144
# How many unique plates?
189145
jump_pred_df.Metadata_Plate.nunique()
190146

191147

192-
# In[9]:
148+
# In[8]:
193149

194150

195151
# How many different individual treatments?
196152
jump_pred_df.query("Metadata_model_type == 'final'").treatment_type.value_counts()
197153

198154

199-
# In[10]:
155+
# In[9]:
200156

201157

202158
# How many unique treatments per treatment type?
203159
jump_pred_df.groupby("treatment_type").treatment.nunique()
204160

205161

206-
# In[11]:
162+
# In[10]:
207163

208164

209165
# How many treatments with phenotype predictions?
210166
jump_pred_df.query("Metadata_model_type == 'final'").phenotype.value_counts()
211167

212-
213-
# ## Convert data to phenotypic profiles
214-
215-
# In[12]:
216-
217-
218-
metadata_columns = [
219-
"Metadata_Plate",
220-
"treatment",
221-
"treatment_type",
222-
"Cell_type",
223-
"Time",
224-
"Metadata_Well",
225-
"cell_count"
226-
]
227-
228-
229-
# In[13]:
230-
231-
232-
jump_wide_final_df = (
233-
jump_pred_df
234-
.query("Metadata_model_type == 'final'")
235-
.drop(columns=["p_value"])
236-
.pivot(index=metadata_columns, columns="phenotype", values="comparison_metric_value")
237-
.reset_index()
238-
)
239-
240-
jump_wide_final_df.to_csv(final_jump_phenotype_file, sep="\t", index=False)
241-
242-
print(jump_wide_final_df.shape)
243-
jump_wide_final_df.head()
244-
245-
246-
# In[14]:
247-
248-
249-
jump_wide_shuffled_df = (
250-
jump_pred_df
251-
.query("Metadata_model_type == 'shuffled'")
252-
.drop(columns=["p_value"])
253-
.pivot(index=metadata_columns, columns="phenotype", values="comparison_metric_value")
254-
.reset_index()
255-
)
256-
257-
jump_wide_shuffled_df.to_csv(shuffled_jump_phenotype_file, sep="\t", index=False)
258-
259-
print(jump_wide_shuffled_df.shape)
260-
jump_wide_shuffled_df.head()
261-
262-
263-
# ## Apply UMAP to phenotypic profiles
264-
265-
# In[15]:
266-
267-
268-
umap_random_seed = 123
269-
umap_n_components = 2
270-
271-
feature_columns = jump_wide_final_df.drop(columns=metadata_columns).columns.tolist()
272-
print(len(feature_columns))
273-
274-
275-
# In[16]:
276-
277-
278-
umap_with_metadata_df = umap_phenotype(
279-
phenotype_df=jump_wide_final_df,
280-
feature_columns=feature_columns,
281-
metadata_columns=metadata_columns,
282-
n_components=umap_n_components,
283-
random_seed=umap_random_seed,
284-
model_type="final"
285-
)
286-
287-
288-
# In[17]:
289-
290-
291-
umap_shuffled_with_metadata_df = umap_phenotype(
292-
phenotype_df=jump_wide_shuffled_df,
293-
feature_columns=feature_columns,
294-
metadata_columns=metadata_columns,
295-
n_components=umap_n_components,
296-
random_seed=umap_random_seed,
297-
model_type="shuffled"
298-
)
299-
300-
301-
# In[18]:
302-
303-
304-
# Output file
305-
umap_full_df = pd.concat([
306-
umap_with_metadata_df,
307-
umap_shuffled_with_metadata_df
308-
], axis="rows")
309-
310-
umap_full_df.to_csv(jump_umap_file, sep="\t", index=False)
311-
312-
print(umap_full_df.shape)
313-
umap_full_df.head()
314-

0 commit comments

Comments
 (0)