11from pathlib import Path
22
3- import anndata
4- import hdf5plugin # noqa: F401
53import numpy as np
64import pandas as pd
75import scanpy as sc
6+ from anndata import AnnData , concat
7+ from anndata .io import read_text
88from scipy .sparse import csr_array , csr_matrix
99from scipy .special import xlogy
10+ from sklearn .preprocessing import scale
1011from sklearn .utils .sparsefuncs import (
1112 inplace_column_scale ,
1213 mean_variance_axis ,
1314)
1415
1516
16- def prepare_dataset (X : anndata . AnnData , geneThreshold : float ) -> anndata . AnnData :
17+ def prepare_dataset (X : AnnData , geneThreshold : float ) -> AnnData :
1718 assert isinstance (X .X , csr_matrix )
1819 assert np .amin (X .X .data ) >= 0.0
1920
@@ -37,7 +38,7 @@ def prepare_dataset(X: anndata.AnnData, geneThreshold: float) -> anndata.AnnData
3738 return X
3839
3940
40- def prepare_dataset_dev (X : anndata . AnnData ) -> anndata . AnnData :
41+ def prepare_dataset_dev (X : AnnData ) -> AnnData :
4142 X .X = csr_array (X .X ) # type: ignore
4243 assert np .amin (X .X .data ) >= 0.0
4344
@@ -88,7 +89,7 @@ def prepare_dataset_dev(X: anndata.AnnData) -> anndata.AnnData:
8889 return X
8990
9091
91- def import_CCLE (pca_option = "dev_pca" ) -> anndata . AnnData :
92+ def import_CCLE (pca_option = "dev_pca" , n_comp = 10 ) -> AnnData :
9293 # pca_option: "dev_pca" preprocesses with prepare_dataset_dev; any other value uses prepare_dataset
9394 """Imports barcoded cell data."""
9495 adatas = {}
@@ -103,7 +104,7 @@ def import_CCLE(pca_option="dev_pca") -> anndata.AnnData:
103104 # "T1_MDAMB231",
104105 "T2_MDAMB231" ,
105106 ):
106- data = anndata . read_text (current_dir / "data" / f"{ name } _count_mtx.tsv.bz2" ).T
107+ data = read_text (current_dir / "data" / f"{ name } _count_mtx.tsv.bz2" ).T
107108 barcodes = pd .read_csv (
108109 current_dir / "data" / f"{ name } _SW.txt" , sep = "\t " , index_col = 0 , header = 0
109110 )
@@ -115,7 +116,7 @@ def import_CCLE(pca_option="dev_pca") -> anndata.AnnData:
115116 barcode_dfs .append (barcodes )
116117 adatas [name ] = data
117118
118- X = anndata . concat (adatas , label = "sample" , index_unique = "-" )
119+ X = concat (adatas , label = "sample" , index_unique = "-" )
119120 X .X = csr_matrix (X .X )
120121
121122 counts = X .obs ["SW" ].value_counts ()
@@ -132,9 +133,11 @@ def import_CCLE(pca_option="dev_pca") -> anndata.AnnData:
132133 # conditional statement for either dev_pca or pca
133134 if pca_option == "dev_pca" :
134135 X = prepare_dataset_dev (X )
135- sc .pp .pca (X , n_comps = 20 , svd_solver = "arpack" )
136+ X .X = scale (X .X )
137+ sc .pp .pca (X , n_comps = n_comp , svd_solver = "arpack" )
136138 else :
137139 X = prepare_dataset (X , geneThreshold = 0.001 )
138- sc .pp .pca (X , n_comps = 20 , svd_solver = "arpack" )
140+ X .X = scale (X .X )
141+ sc .pp .pca (X , n_comps = n_comp , svd_solver = "arpack" )
139142
140143 return X
0 commit comments