Skip to content

Commit

Permalink
combineFactorLevels(): hash instead of TRUELENGTH
Browse files Browse the repository at this point in the history
  • Loading branch information
aitap committed Dec 26, 2024
1 parent 32afbc0 commit fff6a48
Showing 1 changed file with 10 additions and 12 deletions.
22 changes: 10 additions & 12 deletions src/fmelt.c
Original file line number Diff line number Diff line change
Expand Up @@ -383,9 +383,8 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna
}

static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType, Rboolean * isRowOrdered)
// Finds unique levels directly in one pass with no need to create hash tables. Creates integer factor
// too in the same single pass. Previous version called factor(x, levels=unique) where x was type character
// and needed hash table.
// Finds unique levels directly in one pass. Creates integer factor too in the same single pass. Previous
// version called factor(x, levels=unique) where x was type character.
// TODO keep the original factor columns as factor and use new technique in rbindlist.c. The calling
// environments are a little different, hence postponed for now (e.g. rbindlist calls writeNA which
// a general purpose combiner would need to know how many to write)
Expand All @@ -404,8 +403,10 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType
SEXP *levelsRaw = (SEXP *)R_alloc(maxlevels, sizeof(SEXP)); // allocate for worst-case all-unique levels
int *ansd = INTEGER(ans);
const SEXP *targetd = STRING_PTR_RO(target);
savetl_init();
// no alloc or any fail point until savetl_end()
R_xlen_t hl = 0;
for (R_xlen_t i = 0; i < nitem; ++i)
hl += xlength(VECTOR_ELT(factorLevels, i));
hashtab * marks = hash_create(hl);
int nlevel=0;
for (int i=0; i<nitem; ++i) {
const SEXP this = VECTOR_ELT(factorLevels, i);
Expand All @@ -414,24 +415,21 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType
for (int k=0; k<thisn; ++k) {
SEXP s = thisd[k];
if (s==NA_STRING) continue; // NA shouldn't be in levels but remove it just in case
int tl = TRUELENGTH(s);
int tl = hash_lookup(marks, s, 0);
if (tl<0) continue; // seen this level before
if (tl>0) savetl(s);
SET_TRUELENGTH(s,-(++nlevel));
hash_set(marks,s,-(++nlevel));
levelsRaw[nlevel-1] = s;
}
}
for (int i=0; i<nrow; ++i) {
if (targetd[i]==NA_STRING) {
*ansd++ = NA_INTEGER;
} else {
int tl = TRUELENGTH(targetd[i]);
int tl = hash_lookup(marks,targetd[i],0);
*ansd++ = tl<0 ? -tl : NA_INTEGER;
}
}
for (int i=0; i<nlevel; ++i) SET_TRUELENGTH(levelsRaw[i], 0);
savetl_end();
// now after savetl_end, we can alloc (which might fail)
// there used to be savetl_end, after which we can alloc (which might fail)
SEXP levelsSxp;
setAttrib(ans, R_LevelsSymbol, levelsSxp=allocVector(STRSXP, nlevel));
for (int i=0; i<nlevel; ++i) SET_STRING_ELT(levelsSxp, i, levelsRaw[i]);
Expand Down

0 comments on commit fff6a48

Please sign in to comment.