move tmp funcs to devel

Pietro ANDREI · Pietro ANDREI · commit 4bc7e096adba · 2025-04-27T15:37:01.000+01:00
diff --git a/R/03_NB_based_cell_comparison.R b/R/03_NB_based_cell_comparison.R
@@ -195,136 +195,3 @@ nbCluster = function(seurat=NULL,n_clust=3:12,seed=347548){
 #  }
   return(seurat)
 }
-
-##Alternative function for NB clustering. To be commented...
-#test_cluster2 = function(seurat=NULL,k_range = 4:15,max_clust=20,seed=347548){
-#  nnmat = KanData(seurat,'nnMat')$nnMat
-#  tot_nn = nnmat[,'tot_nn']
-#  tot_nn[tot_nn == 0] = 1
-#  nnmat[,'tot_nn'] = NULL
-#  nnmat = nnmat %>% dplyr::select_if(is.numeric)
-#  scaled_nnmat = Matrix::Diagonal(x=(1/tot_nn),names = T) %*% as(as.matrix(nnmat),'CsparseMatrix')
-#  rownames(scaled_nnmat) = rownames(nnmat)
-#  scaled_nnmat = as.data.frame(as.matrix(scaled_nnmat))
-#  clusters = lapply(k_range,function(x){
-#    set.seed(seed)
-#    clust = as.data.frame(kmeans(scaled_nnmat,centers=x,iter.max = 20)$cluster)
-#    colnames(clust) = paste0('K_',x)
-#    return(clust)
-#  })
-#  clusters = purrr::reduce(clusters,cbind) %>% mutate_all(as.factor)
-#  set.seed(seed)
-#  clusters = greed::greed(clusters,model=greed::Lca(),alg = greed::Seed(), K=max_clust)
-#  clusters = greed::clustering(clusters)
-#  names(clusters) = rownames(na.omit(scaled_nnmat))
-#  clusters[setdiff(colnames(seurat),names(clusters))] = NA
-#  seurat[["LCA_nbClust"]] = as.character(clusters[colnames(seurat)])
-#  return(seurat)
-#}
-
-
-#' @title Calculate Shannon entropy of cell neighbourhood composition
-#' @name nb_entropy
-#' @description
-#' Given the cell type composition of each single-cell neighbourhood, calculate the Shannon entropy associated with that neighbourhood
-#' @param seurat seurat object containing Kandinsky data slot with pre-computed neighbourhood composition matrix in `nnMat` slot
-#' @returns seurat object with new metadata variable reporting neighbourhood entropy for each cell
-#' @export
-nb_entropy = function(seurat){
-  nnmat = KanData(seurat,'nnMat')$nnMat
-  tot_nn = nnmat[,'tot_nn']
-  tot_nn[tot_nn == 0] = 1
-  nnmat[,'tot_nn'] = NULL
-  nnmat = nnmat %>% dplyr::select_if(is.numeric)
-  scaled_nnmat = Matrix::Diagonal(x=(1/tot_nn)) %*% as(as.matrix(nnmat),'CsparseMatrix')
-  seurat$nb_entropy = apply(scaled_nnmat,1,function(x){
-    logv = log2(x)
-    logv = ifelse(logv == -Inf,0,logv)
-    return(-sum(x*logv))})
-  return(seurat)
-}
-
-
-#' @title Neighbourhood-based multiple linear regression of gene expression values
-#' @name nb_lm
-#' @description
-#' Fit a multiple linear regression for a gene of interest using neighbourhood cell count for each cell type as independent variables
-#'
-#' @details
-#' This function is based on the fastLm() function from the RcppEigen package. If the user wants to include one or more confounding (random) variables in the model,
-#' the function lmer() is used instead to fit a mixed-effects model. In both cases, the output is a data frame reporting the regression coefficient and p-value for each independent variable.
-#'
-#' @param seurat a Seurat object containing Kandinsky data slot
-#' @param label character string specifying the name of the variable to be used for cell type annotation.
-#' @param which name of the cell type to consider for fitting linear models. Currently only one cell type is accepted as input
-#' @param features character vector containing gene identifiers to be fitted
-#' @param layer name of Seurat layer to use for gene expression data
-#' @param rand name of random variables to be included in a mixed effect model.
-#' @param minprop numeric, genes expressed in less than this proportion of cells will be excluded from the analysis
-#'
-#' @returns data frame reporting, for each gene, the regression coefficient and p-value associated with each cell type
-#' @export
-#'
-nb_lm = function(seurat,label=NULL,which=NULL,features=NULL,layer='data',rand=NULL,minprop=0.1){
-  if(length(KanData(seurat,'nnMat'))>0){
-    mat = KanData(seurat,'nnMat')$nnMat
-  }else{
-    mat = nnMat(seurat,label=label,return.seurat=F)
-  }
-  if(!is.null(rand)){
-    rands = FetchData(seurat,rand)
-    rands = rands[rownames(mat),,drop=F]
-    mat[,colnames(rands)] = rands
-  }
-  label = label %||% KanData(seurat,'nnMat')$col.anno
-  features = features %||% rownames(seurat)
-  if(is.null(which)){
-    warning('argument "which" not specified. All cells in the dataset will be considered as a unique class')
-  }else{
-    cells = rownames(seurat@meta.data[which(seurat@meta.data[[label]] %in% which),])
-    mat = mat[cells,]
-  }
-  #Prepare gene expression data (keep only expressed genes)
-  expr = LayerData(seurat,layer)
-  expr = expr[,rownames(mat)]
-  keep = which(rowSums(expr >0) >=round(ncol(expr)*minprop))
-  expr = expr[keep,]
-  features = intersect(features,rownames(expr))
-
-  message('testing ',length(features),' genes detected in at least ',round(ncol(expr)*minprop),' cells')
-  expr = expr[features,]
-
-  #Prepare linear model formula
-  labels = unique(as.character(seurat@meta.data[,c(label)]))
-  labels = intersect(labels,colnames(mat))
-  if(is.null(rand)){
-    formula =paste0('RcppEigen::fastLm( scale(expr[x,]) ~ ',paste(paste0('mat[,"',labels,'"]'),collapse="+"),')')
-  }else{
-    formula =paste0('lmerTest::lmer( scale(expr[x,]) ~ ',paste(paste0('mat[,"',labels,'"]'),collapse="+"),'+',
-                    paste(paste0('(1|mat[,"',rand,'"])'),collapse="+"),',control=lme4::lmerControl(calc.derivs = FALSE,optCtrl = list(algorithm = "NLOPT_LN_BOBYQA")))')
-  }
-  if(is.null(rand)){
-    message('Fitting linear models for ',length(features),' genes...')
-  }else{
-    message('Fitting linear mixed models (',paste(rand,collapse=','),' as random effect(s))',' for ',length(features),' genes...')
-  }
-  #Fit linear (mixed) model for all genes to be tested
-  all_genes = lapply(features,function(x){
-    flmmod = suppressMessages(eval(parse(text=formula)))
-    if(is.null(rand)){
-      res = as.data.frame(summary(flmmod)[[1]][-1,c(3,4)])
-    }else{
-      res = as.data.frame(stats::anova(flmmod)[,c(5,6)])
-    }
-    res$gene = x
-    res$var = labels
-    rownames(res) = NULL
-    return(res)
-  })
-  #Combine results from all genes into a unique table
-  all_genes = purrr::reduce(all_genes,rbind)
-  colnames(all_genes)[c(1,2)] = c('stat','pval')
-  colnames(all_genes)[4] = label
-  message('...done!')
-  return(all_genes)
-}
diff --git a/R/06_Utils.R b/R/06_Utils.R
@@ -263,54 +263,6 @@ resample_cells = function(seurat=NULL,label='cell_types',spatial=T,maxcells=1000
   }
 }
 
-
-#' @title subsample neighbour network links
-#' @name subsample_nb
-#' @description
-#' Given a Kandinsky neighbour network in a listw format, this function randomly subsets
-#' the neighbours assigned to each cell, down to a user-defined maximum number of neighbours.
-#' @details
-#' In order to be compatible with most of Kandinsky functions (due to its dependency on spdep package), the resulting
-#' neighbour matrix must be symmetric even after the subsetting. Therefore, an additional step to ensure matrix symmetry
-#' is applied after the subsetting, and for this reason some cells might still have a number of neighbours higher than the user-provided threshold.
-#'
-#' @param seurat a Seurat object containing Kandinsky data slot
-#' @param exp_links numeric, maximum number of expected neighbours per cell after the subsetting
-#' @param seed numeric, random seed for reproducibility
-#' @returns seurat object with its updated neighbour network stored in a listw format
-#'
-#' @export
-subsample_nb = function(seurat=NULL,exp_links=6,seed=347548){
-  mat= as(KanData(seurat,'nb'),'CsparseMatrix')
-  #if(inherits(listw,'listw')){data='listw';listw=as(listw,'CsparseMatrix')}
-  avg_size= median(Matrix::rowSums(mat))
-  message('Current median number of neighbour links per cell is ',avg_size,'. Trying to reduce to ',exp_links,'...')
-  #ratio = (exp_links/avg_size)
-  #prop = (1-ratio/2)
-  set.seed(seed)
-  colN <- diff(mat@p)
-  indx <- cbind(mat@i+1,rep(seq_along(colN),colN))
-  indx = cbind(indx,1:length(mat@x))
-  indx = as.data.frame(indx)
-  indx = dplyr::group_by(indx,.data[["V2"]]) %>%
-    dplyr::mutate(size=ifelse(dplyr::n()>(exp_links/2),(exp_links/2),dplyr::n())) %>%
-    dplyr::filter(.data[["V1"]] %in% sample(.data[["V1"]],size=unique(.data[["size"]]))) %>%
-    dplyr::ungroup() %>%
-    dplyr::select(.data[["V3"]])
-
-  #mat@x[sample(length(mat@x),size=round(length(mat@x)*prop))] = 0
-  mat@x[setdiff(1:length(mat@x),indx$V3)]=0
-  mat = Matrix::drop0(mat)
-  message('Making neighbour matrix symmetric...')
-  mat = mat + Matrix::t(mat)
-  mat@x[mat@x>1]=1
-  new_avg_size = median(Matrix::rowSums(mat))
-  message('Returning updated neighour network with a median number of ',new_avg_size,' links per cell')
-  KanData(seurat,'nb') = suppressWarnings(spdep::mat2listw(mat,row.names=rownames(mat),style = 'B',zero.policy=T))
-  return(seurat)
-}
-
-
 #' @name global_univ_spatcor
 #' @title Compute global Moran's I spatial autocorrelation statistic
 #' @param seurat a Seurat object containing Kandinsky data (`KanData()`)
@@ -428,37 +380,6 @@ get_nbcounts = function(seurat=NULL,label=NULL,which=NULL){
   return(nb)
 }
 
-#' @title Calculate gene-specific contamination score
-#' @name gene_contamination_score
-#' @description
-#' For each gene expressed within a dataset, given a cell type of interest,
-#'  this function will calculate a contamination score, expressed as the ratio between the average expression in the neighbouring cells (as defined by Kandinsky) belonging to different cell types
-#'  and the average expression in the cells belonging to the defined cell type. A high contamination score indicates lower expression of a gene within a cell than in its neighbouring cells; a low score indicates higher expression.
-#'
-#' @param seurat Seurat object containing Kandinsky data slot
-#' @param label character string indicating meta data variable containing cell type annotation
-#' @param which character vector indicating for which cell type the gene contamination scores will be calculated
-#' @returns named vector of contamination scores, with each score named with its corresponding gene symbol
-#' @export
-gene_contamination_score = function(seurat=NULL,label=NULL,which=NULL){
-  envmat = get_nbcounts(seurat,label,which)
-  if(length(KanData(seurat,'nnMat'))>0){
-    tot_nn = KanData(seurat,'nnMat')$nnMat[,'tot_nn']
-  }else{
-    tot_nn = nnMat(seurat,label=label,return.seurat=F)[,'tot_nn']
-  }
-  if(!is.null(label) & !is.null(which)){
-    tot_nn = tot_nn[seurat@meta.data[[label]] %in% which]
-    selfmat = Matrix::rowMeans(LayerData(seurat,'counts')[,seurat@meta.data[[label]] %in% which])
-  }else{
-    selfmat = Matrix::rowMeans(LayerData(seurat,'counts'))
-  }
-  envmat = Matrix::t(Matrix::t(envmat) %*% Matrix::Diagonal(x=1/tot_nn))
-  envmat = Matrix::colMeans(envmat)
-  scores = setNames(nm=rownames(seurat),object=envmat/selfmat)
-  return(scores)
-}
-
 #################################
 ###########COSMX UTILS###########
 #################################