@@ -314,7 +314,7 @@ private static void extractFeatureWorker(
314
314
// construct the feature name and set its value to 1.0
315
315
// however, only add the featureName if the featureName alphabet is allowed to grow.
316
316
String fname = internalFeatureNamePrefix ;
317
- addToFeatureVector (fv , fname , 1.0 );
317
+ setInFeatureVector (fv , fname , 1.0 );
318
318
} else {
319
319
// First get the value as an Object; if there is no value, we have an Object that is null
320
320
// If the sourceAnnotation is null, we already did not find the source at all,
@@ -344,22 +344,22 @@ private static void extractFeatureWorker(
344
344
Iterable iterable = (Iterable ) valObj ;
345
345
for (Object obj : iterable ) {
346
346
String val = obj .toString ();
347
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
347
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
348
348
}
349
349
} else if (valObj instanceof Map ) {
350
350
Map map = (Map ) valObj ;
351
351
for (Object key : map .keySet ()) {
352
352
Object mapval = map .get (key );
353
353
String val = key .toString () + "=" + mapval .toString ();
354
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
354
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
355
355
}
356
356
} else if (valObj instanceof Object []) {
357
357
for (Object obj : ((Object []) valObj )) {
358
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + obj .toString (), 1.0 );
358
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + obj .toString (), 1.0 );
359
359
}
360
360
} else if (valObj instanceof int []) {
361
361
for (int intval : ((int []) valObj )) {
362
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + intval , 1.0 );
362
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + intval , 1.0 );
363
363
}
364
364
// TODO: other array types??
365
365
} else {
@@ -371,18 +371,18 @@ private static void extractFeatureWorker(
371
371
for (String v : vals ) {
372
372
// NOTE: we automatically remove any empty elements here
373
373
if (!v .trim ().isEmpty ()) {
374
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + v .trim (), 1.0 );
374
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + v .trim (), 1.0 );
375
375
}
376
376
}
377
377
} else // just take the value as is.
378
378
// Only in this case we allow for optionally getting the score from a different
379
379
// feature of the same annotation we got the value from.
380
380
if (featureName4Value .isEmpty ()) {
381
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
381
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
382
382
} else {
383
383
// NOTE: sourceAnnotation should always be non-null here since valObj is non-null
384
384
double score = gate .plugin .learningframework .LFUtils .anyToDoubleOrElse (sourceAnnotation .getFeatures ().get (featureName4Value ), 1.0 );
385
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , score );
385
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , score );
386
386
}
387
387
}
388
388
} else {
@@ -397,7 +397,7 @@ private static void extractFeatureWorker(
397
397
case zero_value : // we treat this identical to keep: no feature set
398
398
break ;
399
399
case special_value : // we use the predefined special value
400
- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + MVVALUE , 1.0 );
400
+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + MVVALUE , 1.0 );
401
401
break ;
402
402
default :
403
403
throw new NotImplementedException ("MV-Handling" );
@@ -416,12 +416,12 @@ private static void extractFeatureWorker(
416
416
if (alphabet .contains (val )) {
417
417
// add the featureName, using the value we have stored for it, but only if the featureName
418
418
// itself can be added
419
- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
419
+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
420
420
} else // we have not seen this value: if the alphabet is allowed to grow add it and
421
421
// then try to add the featureName, otherwise, do nothing
422
422
if (!alphabet .growthStopped ()) {
423
423
// the lookupIndex method automatically adds the value if it is not there yet
424
- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
424
+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
425
425
} else {
426
426
//System.out.println("DEBUG: number, growStopped");
427
427
}
@@ -433,21 +433,21 @@ private static void extractFeatureWorker(
433
433
//System.out.println("DEBUG: other, mv, setProp");
434
434
break ;
435
435
case keep :
436
- addToFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
436
+ setInFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
437
437
break ;
438
438
case zero_value : // use the "special_value"
439
- addToFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
439
+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
440
440
String val = MVVALUE ;
441
441
if (alphabet .contains (val )) {
442
- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
442
+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
443
443
} else if (!alphabet .growthStopped ()) {
444
- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
444
+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
445
445
} else {
446
446
//System.out.println("DEBUG: number, growStopped");
447
447
}
448
448
break ;
449
449
case special_value : // we use the special value -1.0 which should get handled by Mallet somehow
450
- addToFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
450
+ setInFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
451
451
break ;
452
452
default :
453
453
throw new NotImplementedException ("MV-Handling" );
@@ -462,27 +462,27 @@ private static void extractFeatureWorker(
462
462
double val = 0.0 ;
463
463
if (valObj instanceof Number ) {
464
464
val = ((Number ) valObj ).doubleValue ();
465
- addToFeatureVector (fv , internalFeatureNamePrefix , val );
465
+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
466
466
} else if (valObj instanceof Boolean ) {
467
467
if ((Boolean ) valObj ) {
468
468
val = 1.0 ;
469
469
} else {
470
470
val = 0.0 ;
471
471
}
472
- addToFeatureVector (fv , internalFeatureNamePrefix , val );
472
+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
473
473
} else if (valObj instanceof double []) {
474
474
// create one feature for each entry in the array
475
475
int i = 0 ;
476
476
for (double el : ((double []) valObj )) {
477
- addToFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , el );
477
+ setInFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , el );
478
478
i ++;
479
479
}
480
480
} else if (valObj instanceof Iterable ) {
481
481
int i = 0 ;
482
482
for (Object el : (Iterable ) valObj ) {
483
483
val = LFUtils .anyToDoubleOrElse (el , 0.0 );
484
484
if (val != 0.0 ) {
485
- addToFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , LFUtils .anyToDoubleOrElse (el , val ));
485
+ setInFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , LFUtils .anyToDoubleOrElse (el , val ));
486
486
}
487
487
i ++;
488
488
}
@@ -497,7 +497,7 @@ private static void extractFeatureWorker(
497
497
+ // take it from the annotation, annType can be empty!
498
498
" at offset " + gate .Utils .start (sourceAnnotation ) + " in document " + doc .getName ());
499
499
}
500
- addToFeatureVector (fv , internalFeatureNamePrefix , val );
500
+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
501
501
}
502
502
//System.err.println("DEBUG: for fname="+featureName+",dt="+dt+", valObj="+valObj+", fv="+fv.numLocations());
503
503
} else {
@@ -508,14 +508,14 @@ private static void extractFeatureWorker(
508
508
//System.out.println("DEBUG: numeric, mv, setProp");
509
509
break ;
510
510
case keep : // for this kind of codeas, we use the value NaN
511
- addToFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
511
+ setInFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
512
512
break ;
513
513
case zero_value : // use the first value, does not make much sense really, but ...
514
514
// TODO: document that this combination should be avoided, probably
515
- addToFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
515
+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
516
516
break ;
517
517
case special_value : // we use the special value -1.0 which should get handled by Mallet somehow
518
- addToFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
518
+ setInFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
519
519
break ;
520
520
default :
521
521
throw new NotImplementedException ("MV-Handling" );
@@ -547,7 +547,7 @@ private static void extractFeatureWorker(
547
547
" at offset " + gate .Utils .start (sourceAnnotation ) + " in document " + doc .getName ());
548
548
}
549
549
}
550
- addToFeatureVector (fv , internalFeatureNamePrefix , val );
550
+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
551
551
} else {
552
552
// we have a missing boolean value
553
553
switch (mvt ) {
@@ -556,14 +556,14 @@ private static void extractFeatureWorker(
556
556
inst .setProperty (PROP_IGNORE_HAS_MV , true );
557
557
break ;
558
558
case keep : // for this kind of codeas, we use the value NaN
559
- addToFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
559
+ setInFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
560
560
break ;
561
561
case zero_value : // Use zero which will make false identical to missing
562
562
// and work well with sparse vectors
563
- addToFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
563
+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
564
564
break ;
565
565
case special_value : // for a missing boolean we use the special value 0.5
566
- addToFeatureVector (fv , internalFeatureNamePrefix , 0.5 );
566
+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.5 );
567
567
break ;
568
568
default :
569
569
throw new NotImplementedException ("MV-Handling" );
@@ -693,15 +693,20 @@ private static void extractFeature(
693
693
prefix = ng .name ;
694
694
}
695
695
prefix = prefix + NAMESEP + "N" + number ;
696
- // NOTE: if we have a featureName4Value set, then we set the feature value to
697
- // what we have calculated, otherwise we add one for each time the ngram occurs
698
- // within the span.
699
- if (featureName4Value .isEmpty ()) {
700
- addToFeatureVector (fv , prefix + VALSEP + ngram , score );
701
- } else {
702
- setFeatureVector (fv , prefix + VALSEP + ngram , score );
703
- }
696
+ // NOTE: for now, we always add to any existing value of the feature vector we
697
+ // may already have. That way, if some ngram occurs multiple times, we use the
698
+ // sum its scores (and the score either is just 1.0 or whatever we got from using
699
+ // the featureName4Value value).
700
+ accumulateInFeatureVector (fv , prefix + VALSEP + ngram , score );
701
+ // NOTE: previously, we only accumulated if there was no weight feature, otherwise
702
+ // the weight was directly used without accumulation
703
+ //if (featureName4Value.isEmpty()) {
704
+ // accumulateInFeatureVector(fv, prefix + VALSEP + ngram, score);
705
+ //} else {
706
+ // setInFeatureVector(fv, prefix + VALSEP + ngram, score);
707
+ //}
704
708
}
709
+ //System.err.println("DEBUG: Vector after adding feature "+ng+" is now "+fv);
705
710
} // extractFeature(NGram)
706
711
707
712
private static void extractFeature (
@@ -1079,28 +1084,44 @@ public static FeatureSpecAttribute lookupAttributeForFeatureName(List<FeatureSpe
1079
1084
/// HELPER AND UTILITY METHODS
1080
1085
///=======================================
1081
1086
/**
1082
- * Same inputAS the method, but makes sure a non-growable Alphabet is considered.
1087
+ * Set a feature in the feature vector, to the given value.
1088
+ * However, if growth is stopped, do not set the feature if the key is not known.
1083
1089
*
1090
+ * This method assumes that the key for this feature vector is only set once, if it
1091
+ * is set another time for the same feature vector, any old value is overridden!
1092
+ *
1084
1093
* @param fv
1085
1094
* @param key
1086
1095
* @param val
1087
1096
*/
1088
- private static void addToFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
1097
+ private static void setInFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
1089
1098
Alphabet a = fv .getAlphabet ();
1090
1099
if (!a .contains (key ) && a .growthStopped ()) {
1091
1100
//System.err.println("DEBUG: GROWTH STOPPED! key="+key+",a="+a);
1092
1101
return ;
1093
1102
}
1094
- fv .add (key , val );
1103
+ if (fv .contains (key )) {
1104
+ System .err .println ("LF DEBUG: setting/overriding a value where there is already one! key=" +key );
1105
+ fv .setValue (a .lookupIndex (key ), val );
1106
+ } else {
1107
+ fv .add (key , val );
1108
+ }
1095
1109
}
1096
1110
1097
- private static void setFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
1111
+ private static void accumulateInFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
1098
1112
Alphabet a = fv .getAlphabet ();
1099
1113
if (!a .contains (key ) && a .growthStopped ()) {
1100
1114
return ;
1101
1115
}
1102
- int index = a .lookupIndex (key );
1103
- fv .setValue (index , val );
1116
+ fv .add (key ,val );
1117
+ // Instead of the previous statement the following was used for debugging:
1118
+ //if(fv.contains(key)) {
1119
+ // fv.add(key,val);
1120
+ //System.err.println("DEBUG accumulate: adding to existing: key="+key+" index="+a.lookupIndex(key)+" loc="+fv.location(a.lookupIndex(key)));
1121
+ //} else {
1122
+ // fv.add(key,val);
1123
+ //System.err.println("DEBUG accumulate: creating new: key="+key+" index="+a.lookupIndex(key)+" loc="+fv.location(a.lookupIndex(key)));
1124
+ //}
1104
1125
}
1105
1126
1106
1127
}
0 commit comments