Skip to content

Commit 555ab31

Browse files
committed
new seqGetData(, .tolist=NA)
1 parent 57984fb commit 555ab31

File tree

8 files changed

+89
-23
lines changed

8 files changed

+89
-23
lines changed

DESCRIPTION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ Package: SeqArray
22
Type: Package
33
Title: Data management of large-scale whole-genome sequence variant calls
44
using GDS files
5-
Version: 1.47.7
5+
Version: 1.47.8
66
Date: 2025-03-31
77
Depends: R (>= 3.5.0), gdsfmt (>= 1.31.1)
8-
Imports: methods, parallel, digest, IRanges, GenomicRanges, GenomeInfoDb,
9-
Biostrings, S4Vectors
8+
Imports: methods, parallel, digest, S4Vectors, IRanges, GenomicRanges,
9+
GenomeInfoDb, Biostrings
1010
LinkingTo: gdsfmt
1111
Suggests: Biobase, BiocGenerics, BiocParallel, RUnit, Rcpp, SNPRelate, crayon,
1212
knitr, markdown, rmarkdown, Rsamtools, VariantAnnotation

NAMESPACE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ importFrom(utils, read.table, flush.console, write.table, str, tail,
3636

3737
importFrom(S4Vectors, DataFrame, SimpleList)
3838
importClassesFrom(IRanges, IRanges)
39-
importFrom(IRanges, IRanges, IntegerList, NumericList, CharacterList,
40-
DataFrameList)
39+
importFrom(IRanges, IRanges, LogicalList, IntegerList, NumericList,
40+
CharacterList, RawList, FactorList, DataFrameList)
4141
importClassesFrom(GenomicRanges, GRanges, GRangesList)
4242
importFrom(GenomicRanges, GRanges)
4343
importMethodsFrom(GenomicRanges, granges)

NEWS

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,28 @@
11
CHANGES IN VERSION 1.48.0
22
-------------------------
33

4-
UTILITIES
5-
6-
o Tweak display in `seqResetVariantID()`
7-
8-
o new argument 'start' in `seqResetVariantID()`
4+
NEW FEATURES
95

106
o `seqAddValue()`: use bit1 for a logical vector; new argument
117
'use_float32' for storing double
128

13-
o use `crayon::silver()` instead of `crayon::blurred()` in the display since
14-
RStudio blurs the screen output
9+
o new argument 'start' in `seqResetVariantID()`
1510

1611
o new argument 'digest' in `seqRecompress()` to add MD5 hash codes
1712

1813
o `seqGetData(, "$chromosome")` returns chromosome codes in an object of
1914
'S4Vectors::Rle'
2015

16+
o `seqGetData(, .tolist=NA)` returns a compressed list defined in IRanges
17+
(e.g., IntegerList) when it is applicable
18+
19+
UTILITIES
20+
21+
o Tweak display in `seqResetVariantID()`
22+
23+
o use `crayon::silver()` instead of `crayon::blurred()` in the display since
24+
RStudio blurs the screen output
25+
2126
BUG FIXES
2227

2328
o `seqBlockApply()` should recover the filter when the user-specified

R/Utilities.R

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,21 @@
1010
#######################################################################
1111

1212
# R expressions called in C internally
13-
lang_eval <- list(
13+
.lang_eval <- list(
1414
new_rle = quote(new("Rle", values=values, lengths=lengths))
1515
)
1616

17+
# Compressed Lists in IRanges
# This list is handed to the C code by .onLoad() (last argument of
# SEQ_Pkg_Init) and is read there by position, so the order of the elements
# must not be changed:
#   elements 1-6 (C indices 0-5): empty prototype objects for logical,
#       integer, numeric, character, raw and factor data, in that order
#   elements 7-9 (C indices 6-8): the slot names "unlistData",
#       "partitioning" and "end" used when filling in a prototype copy
18+
.List_IRanges_value <- list(
19+
LogicalList(), IntegerList(), NumericList(),
20+
CharacterList(), RawList(), FactorList(),
21+
"unlistData", "partitioning", "end"
22+
)
23+
1724
.onLoad <- function(lib, pkg)
1825
{
19-
.Call(SEQ_Pkg_Init, .dim_name, process_count, process_index, lang_eval)
26+
.Call(SEQ_Pkg_Init, .dim_name, process_count, process_index,
27+
.lang_eval, .List_IRanges_value)
2028
TRUE
2129
}
2230

man/seqBlockApply.Rd

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ seqBlockApply(gdsfile, var.name, FUN, margin=c("by.variant"),
4040
\item{.padNA}{\code{TRUE}, pad a variable-length vector with NA if the
4141
number of data points for each variant is not greater than 1}
4242
\item{.tolist}{if \code{TRUE}, return a list of vectors instead of the
43-
structure \code{list(length, data)} for variable-length data}
43+
structure \code{list(length, data)} for variable-length data; if \code{NA},
44+
return a compressed \code{List} defined in \pkg{IRanges} when it is
45+
applicable}
4446
\item{.progress}{if \code{TRUE}, show progress information}
4547
\item{...}{optional arguments to \code{FUN}}
4648
}

man/seqGetData.Rd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ seqGetData(gdsfile, var.name, .useraw=FALSE, .padNA=TRUE, .tolist=FALSE,
1919
\item{.padNA}{\code{TRUE}, pad a variable-length vector with NA if the
2020
number of data points for each variant is not greater than 1}
2121
\item{.tolist}{if \code{TRUE}, return a list of vectors instead of the
22-
structure \code{list(length, data)} for variable-length data}
22+
structure \code{list(length, data)} for variable-length data; if \code{NA},
23+
return a compressed \code{List} defined in \pkg{IRanges} when it is
24+
applicable}
2325
\item{.envir}{\code{NULL}, an environment object, a list or a
2426
\code{data.frame}}
2527
}
@@ -146,6 +148,10 @@ seqGetData(f, "annotation/info/AA", .padNA=FALSE)
146148
# or return a simplified vector
147149
seqGetData(f, "annotation/info/AA", .padNA=TRUE)
148150

151+
# return a compressed list (CharacterList)
152+
seqGetData(f, "annotation/info/AA", .padNA=FALSE, .tolist=NA)
153+
154+
149155
# get annotation/format/DP, a variable-length dataset
150156
seqGetData(f, "annotation/format/DP")
151157
# $length <- indicating the length of each variable-length data

src/GetData.cpp

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -855,14 +855,53 @@ static SEXP get_list(SEXP len, SEXP val, size_t elmsize, bool is_factor)
855855
memcpy(RAW(vv), &RAW(val)[pt], nn);
856856
break;
857857
default:
858-
throw ErrSeqArray("Not support data type for .tolist=TRUE.");
858+
throw ErrSeqArray("Not support data type when '.tolist=TRUE'.");
859859
}
860860
pt += nn;
861861
}
862862
}
863863
return rv_ans;
864864
}
865865

866+
867+
// .List_IRanges_value
868+
extern "C" SEXP OBJ_CompressedList = NULL;
869+
870+
/// Convert variable-length data to a compressed list in IRanges.
///
/// A copy of one of the prototype objects stored in OBJ_CompressedList is
/// filled in: its 'unlistData' slot is set to `val`, and the 'end' slot of
/// its 'partitioning' slot is set to the running totals of `len`.
///
///   len       -- the number of values belonging to each list element; it is
///                read with INTEGER(), so it is assumed to be an integer vector
///   val       -- the concatenated values of all list elements
///   is_factor -- if true, the prototype at index 5 is used instead of the
///                one selected by TYPEOF(val)
///
/// Returns the filled-in copy. Throws ErrSeqArray when TYPEOF(val) is not
/// logical, integer, real, string or raw (checked before `is_factor` is
/// applied).
///
/// NOTE(review): rv_ans is PROTECTed below but only one UNPROTECT(1) is
/// issued (after `end` is also PROTECTed), so the result appears to be
/// returned still protected -- presumably the caller balances the stack;
/// confirm against get_info().
871+
static SEXP get_list2(SEXP len, SEXP val, bool is_factor)
872+
{
873+
// get data type: map the R type of `val` to the position of the matching
// prototype in OBJ_CompressedList (0..4)
874+
int type_idx = -1;
875+
switch (TYPEOF(val))
876+
{
877+
case LGLSXP: type_idx = 0; break;
878+
case INTSXP: type_idx = 1; break;
879+
case REALSXP: type_idx = 2; break;
880+
case STRSXP: type_idx = 3; break;
881+
case RAWSXP: type_idx = 4; break;
882+
default:
883+
throw ErrSeqArray("Not support data type when '.tolist=NA'.");
884+
}
// a factor overrides the type-based choice and uses the prototype at index 5
885+
if (is_factor) type_idx = 5;
886+
// new object: duplicate the prototype so the stored one is not modified
887+
SEXP rv_ans = Rf_duplicate(VECTOR_ELT(OBJ_CompressedList, type_idx));
888+
PROTECT(rv_ans);
889+
// set @unlistData (OBJ_CompressedList[6]="unlistData")
890+
R_do_slot_assign(rv_ans, VECTOR_ELT(OBJ_CompressedList, 6), val);
891+
// get @partitioning (OBJ_CompressedList[7]="partitioning")
892+
SEXP pt = R_do_slot(rv_ans, VECTOR_ELT(OBJ_CompressedList, 7));
893+
// set @partitioning@end (OBJ_CompressedList[8]="end")
// end[i] is the cumulative sum len[0] + ... + len[i]
// NOTE(review): the running total `ed` is an int -- confirm the total number
// of values cannot exceed INT_MAX
894+
const int n = Rf_length(len);
895+
SEXP end = PROTECT(NEW_INTEGER(n));
896+
const int *s = INTEGER(len);
897+
int *p = INTEGER(end), ed = 0;
898+
for (int i=0; i < n; i++) p[i] = (ed += s[i]);
// NOTE(review): the return value of R_do_slot_assign() is discarded and `pt`
// is not written back into rv_ans, so this relies on `pt` being modified in
// place -- TODO confirm
899+
R_do_slot_assign(pt, VECTOR_ELT(OBJ_CompressedList, 8), end);
900+
UNPROTECT(1);
901+
// output
902+
return rv_ans;
903+
}
904+
866905
/// get data from annotation/info/VARIABLE, TODO
867906
static SEXP get_info(CFileInfo &File, TVarMap &Var, void *param)
868907
{
@@ -969,11 +1008,14 @@ static SEXP get_info(CFileInfo &File, TVarMap &Var, void *param)
9691008
default:
9701009
throw ErrSeqArray("Not support data type for .padNA=TRUE.");
9711010
}
972-
} else if (P->tolist)
1011+
} else if (P->tolist) // TRUE or NA
9731012
{
9741013
// convert to a list
9751014
size_t d2 = (Var.NDim < 2) ? 1 : dimcnt[1];
976-
rv_ans = get_list(I32, val, d2, is_factor);
1015+
if ((P->tolist == TRUE) || (d2 != 1))
1016+
rv_ans = get_list(I32, val, d2, is_factor);
1017+
else
1018+
rv_ans = get_list2(I32, val, is_factor);
9771019
} else {
9781020
// create `list(length, data)`
9791021
rv_ans = PROTECT(NEW_LIST(2));
@@ -1369,9 +1411,9 @@ COREARRAY_DLL_EXPORT SEXP SEQ_GetData(SEXP gdsfile, SEXP var_name, SEXP UseRaw,
13691411
if (padNA == NA_LOGICAL)
13701412
Rf_error("'.padNA' must be TRUE or FALSE.");
13711413
// .tolist
1414+
if (!Rf_isLogical(ToList) || Rf_length(ToList)!=1)
1415+
Rf_error("'.tolist' must be TRUE, FALSE or NA.");
13721416
const int tolist = Rf_asLogical(ToList);
1373-
if (tolist == NA_LOGICAL)
1374-
Rf_error("'.tolist' must be TRUE or FALSE.");
13751417
// .envir
13761418
if (!Rf_isNull(Env))
13771419
{

src/SeqArray.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1567,9 +1567,10 @@ COREARRAY_DLL_EXPORT SEXP SEQ_ProgressAdd(SEXP ref, SEXP inc)
15671567
// ===========================================================
15681568

15691569
extern SEXP LANG_NEW_RLE;
1570+
extern SEXP OBJ_CompressedList;
15701571

15711572
COREARRAY_DLL_EXPORT SEXP SEQ_Pkg_Init(SEXP dim_name, SEXP proc_cnt,
1572-
SEXP proc_idx, SEXP lang_eval)
1573+
SEXP proc_idx, SEXP lang_eval, SEXP list_val)
15731574
{
15741575
// .dim_name
15751576
R_Geno_Dim2_Name = VECTOR_ELT(dim_name, 0);
@@ -1583,6 +1584,8 @@ COREARRAY_DLL_EXPORT SEXP SEQ_Pkg_Init(SEXP dim_name, SEXP proc_cnt,
15831584
R_Process_Index = INTEGER(proc_idx);
15841585
// lang_eval
15851586
LANG_NEW_RLE = VECTOR_ELT(lang_eval, 0);
1587+
// Compressed Lists in IRanges
1588+
OBJ_CompressedList = list_val;
15861589
// return
15871590
return R_NilValue;
15881591
}
@@ -1651,7 +1654,7 @@ COREARRAY_DLL_EXPORT void R_init_SeqArray(DllInfo *info)
16511654

16521655
static R_CallMethodDef callMethods[] =
16531656
{
1654-
CALL(SEQ_Pkg_Init, 4),
1657+
CALL(SEQ_Pkg_Init, 5),
16551658
CALL(SEQ_ExternalName0, 0), CALL(SEQ_ExternalName1, 1),
16561659
CALL(SEQ_ExternalName2, 2), CALL(SEQ_ExternalName3, 3),
16571660
CALL(SEQ_ExternalName4, 4), CALL(SEQ_ExternalName5, 5),

0 commit comments

Comments
 (0)