|
| 1 | +""" |
| 2 | +Figure 4 -- Recreation of the GEMLI paper's boxplot of correlation distance |
| 3 | +for related cells (blue) and randomly sampled cells (gray). |
| 4 | +""" |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +import numpy as np |
| 8 | +import seaborn as sns |
| 9 | +from scipy.spatial.distance import pdist |
| 10 | +from pf2barcode.imports import import_CCLE |
| 11 | +from .common import getSetup, subplotLabel |
| 12 | +from sklearn.preprocessing import scale |
| 13 | + |
| 14 | + |
def _sample_distinct_pairs(rng, n_cells, n_pairs):
    """Draw ``n_pairs`` ordered pairs of distinct row indices in ``range(n_cells)``.

    Requires ``n_cells >= 2``. Returns two integer arrays of length ``n_pairs``
    whose entries differ at every position.
    """
    first = rng.integers(0, n_cells, size=n_pairs)
    # Draw the partner from the n_cells - 1 remaining rows, then shift every
    # value at or above `first` up by one so a cell is never paired with itself.
    second = rng.integers(0, n_cells - 1, size=n_pairs)
    second += second >= first
    return first, second


def _pair_correlation_distances(mat, first, second, chunk_size=256):
    """Return ``1 - Pearson r`` between rows ``mat[first[k]]`` and ``mat[second[k]]``.

    Pairs are processed ``chunk_size`` at a time so only a small slab of rows is
    copied at once. A constant row yields NaN, as ``np.corrcoef`` would.
    """
    distances = np.empty(len(first), dtype=float)
    for start in range(0, len(first), chunk_size):
        stop = start + chunk_size
        a = mat[first[start:stop]].astype(float)
        b = mat[second[start:stop]].astype(float)
        a -= a.mean(axis=1, keepdims=True)
        b -= b.mean(axis=1, keepdims=True)
        norm = np.sqrt(np.einsum("ij,ij->i", a, a) * np.einsum("ij,ij->i", b, b))
        corr = np.einsum("ij,ij->i", a, b) / norm
        # Clip to [-1, 1] to absorb floating-point overshoot, like np.corrcoef.
        distances[start:stop] = 1.0 - np.clip(corr, -1.0, 1.0)
    return distances


def makeFigure():
    """Boxplot of correlation distance for related and random cells per lineage.

    For every lineage barcode (``obs["SW"]``) that is not ``"unknown"`` and has
    more than 10 cells, the pairwise correlation distances between the cells
    sharing that barcode are compared with the distances between randomly
    drawn pairs of distinct cells taken from the whole filtered dataset.

    Returns:
        The figure object produced by ``getSetup``.

    Raises:
        ValueError: if no lineage is left after filtering.
    """
    ax, f = getSetup((8, 4), (1, 1))
    subplotLabel(ax)

    X = import_CCLE(pca_option="none")

    # Filter out unknown or rare barcodes
    X = X[X.obs["SW"] != "unknown"]
    sw_counts = X.obs["SW"].value_counts()
    good_SW = sw_counts.index[sw_counts > 10]
    X = X[X.obs["SW"].isin(good_SW)]

    # Convert matrix to dense for correlation computation
    mat = X.X.toarray()
    df = X.obs.copy()
    # Positional row number of each cell, used to index into `mat`.
    df["index"] = np.arange(len(df))
    n_cells = mat.shape[0]

    # Fixed seed so the random baseline (and hence the figure) is reproducible.
    rng = np.random.default_rng(0)
    # The random group holds this many times the number of within-lineage pairs.
    n_random_draws = 100

    results = []

    # observed=True skips empty groups when "SW" is categorical.
    for sw, cells in df.groupby("SW", observed=True):
        idx = cells["index"].values
        if len(idx) < 2:
            continue

        # Related (within-lineage) distances
        related = pdist(mat[idx], metric="correlation")

        # Random pairs of distinct cells drawn from all cells
        first, second = _sample_distinct_pairs(
            rng, n_cells, n_random_draws * len(related)
        )
        random_dists = _pair_correlation_distances(mat, first, second)

        n_total = len(related) + len(random_dists)
        results.append(
            pd.DataFrame(
                {
                    "Correlation distance": np.concatenate([related, random_dists]),
                    "Group": np.repeat(
                        ["Cell lineage", "Random cells"],
                        [len(related), len(random_dists)],
                    ),
                    "Lineage": [sw] * n_total,
                }
            )
        )

    if not results:
        raise ValueError("No lineage with at least two cells remains after filtering")

    df_plot = pd.concat(results, ignore_index=True)

    sns.boxplot(
        data=df_plot,
        x="Lineage",
        y="Correlation distance",
        hue="Group",
        showfliers=False,
        palette={"Cell lineage": "#377eb8", "Random cells": "#bbbbbb"},
        ax=ax[0],
    )

    ax[0].set_title("Correlation distance for related and random cells per lineage")
    ax[0].set_xlabel("Lineage barcode (SW)")
    ax[0].set_ylabel("Correlation distance")
    ax[0].legend(title=None)

    return f
0 commit comments