Skip to content

Commit 8347b4c

Browse files
committed
Merge branch 'master' of github.com:GateNLP/gateplugin-LearningFramework
2 parents be6f17b + 77836a9 commit 8347b4c

File tree

5 files changed

+67
-44
lines changed

5 files changed

+67
-44
lines changed

creole.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ along with this software. If not, see <http://www.gnu.org/licenses/>.
2020
-->
2121
<CREOLE-DIRECTORY
2222
ID="gate.LearningFramework"
23-
VERSION="3.5.2"
23+
VERSION="3.5.3"
2424
DESCRIPTION="Learning Framework"
2525
HELPURL="https://github.com/GateNLP/gateplugin-LearningFramework/wiki"
2626
>

src/gate/plugin/learningframework/data/CorpusRepresentationMalletTarget.java

+1
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ static Instance extractIndependentFeaturesHelper(
165165
Pipe pipe) {
166166

167167
AugmentableFeatureVector afv = new AugmentableFeatureVector(pipe.getDataAlphabet());
168+
// Constructor parms: data, target, name, source
168169
Instance inst = new Instance(afv, null, null, null);
169170
for(FeatureSpecAttribute attr : featureInfo.getAttributes()) {
170171
FeatureExtraction.extractFeature(inst, attr, inputAS, instanceAnnotation);

src/gate/plugin/learningframework/engines/EngineMalletClass.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ public void initializeAlgorithm(Algorithm algorithm, String parms) {
122122
// there are parameters, so if it is one of the algorithms where we support setting
123123
// a parameter do this
124124
if (algorithm.equals(AlgorithmClassification.MALLET_CL_C45)) {
125-
Parms ps = new Parms(parms, "m:maxDepth:i", "p:prune:b","n:minNumInsts:i");
125+
Parms ps = new Parms(parms, "m:maxDepth:i", "p:prune:B","n:minNumInsts:i");
126126
int maxDepth = (int)ps.getValueOrElse("maxDepth", 0);
127127
int minNumInsts = (int)ps.getValueOrElse("minNumInsts", 2);
128128
boolean prune = (boolean)ps.getValueOrElse("prune",true);

src/gate/plugin/learningframework/features/FeatureExtraction.java

+63-42
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ private static void extractFeatureWorker(
314314
// construct the featureName name and set to 1.0
315315
// however, only add the featureName if the featureName alphabet is allowed to grow.
316316
String fname = internalFeatureNamePrefix;
317-
addToFeatureVector(fv, fname, 1.0);
317+
setInFeatureVector(fv, fname, 1.0);
318318
} else {
319319
// First get the value as an Object; if there is no value, we have an Object that is null
320320
// If the sourceAnnotation is null, we already did not find the source at all,
@@ -344,22 +344,22 @@ private static void extractFeatureWorker(
344344
Iterable iterable = (Iterable) valObj;
345345
for (Object obj : iterable) {
346346
String val = obj.toString();
347-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, 1.0);
347+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, 1.0);
348348
}
349349
} else if (valObj instanceof Map) {
350350
Map map = (Map) valObj;
351351
for (Object key : map.keySet()) {
352352
Object mapval = map.get(key);
353353
String val = key.toString() + "=" + mapval.toString();
354-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, 1.0);
354+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, 1.0);
355355
}
356356
} else if (valObj instanceof Object[]) {
357357
for (Object obj : ((Object[]) valObj)) {
358-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + obj.toString(), 1.0);
358+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + obj.toString(), 1.0);
359359
}
360360
} else if (valObj instanceof int[]) {
361361
for (int intval : ((int[]) valObj)) {
362-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + intval, 1.0);
362+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + intval, 1.0);
363363
}
364364
// TODO: other array types??
365365
} else {
@@ -371,18 +371,18 @@ private static void extractFeatureWorker(
371371
for (String v : vals) {
372372
// NOTE: we automatically remove any empty elements here
373373
if (!v.trim().isEmpty()) {
374-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + v.trim(), 1.0);
374+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + v.trim(), 1.0);
375375
}
376376
}
377377
} else // just take the value as is.
378378
// Only in this case we allow for optionally getting the score from a different
379379
// feature of the same annotation we got the value from.
380380
if (featureName4Value.isEmpty()) {
381-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, 1.0);
381+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, 1.0);
382382
} else {
383383
// NOTE: sourceAnnotation should always be non-null here since valObj is non-null
384384
double score = gate.plugin.learningframework.LFUtils.anyToDoubleOrElse(sourceAnnotation.getFeatures().get(featureName4Value), 1.0);
385-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, score);
385+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + val, score);
386386
}
387387
}
388388
} else {
@@ -397,7 +397,7 @@ private static void extractFeatureWorker(
397397
case zero_value: // we treat this identical to keep: no feature set
398398
break;
399399
case special_value: // we use the predefined special value
400-
addToFeatureVector(fv, internalFeatureNamePrefix + VALSEP + MVVALUE, 1.0);
400+
setInFeatureVector(fv, internalFeatureNamePrefix + VALSEP + MVVALUE, 1.0);
401401
break;
402402
default:
403403
throw new NotImplementedException("MV-Handling");
@@ -416,12 +416,12 @@ private static void extractFeatureWorker(
416416
if (alphabet.contains(val)) {
417417
// add the featureName, using the value we have stored for it, but only if the featureName
418418
// itself can be added
419-
addToFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(val));
419+
setInFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(val));
420420
} else // we have not seen this value: if the alphabet is allowed to grow add it and
421421
// then try to add the featureName, otherwise, do nothing
422422
if (!alphabet.growthStopped()) {
423423
// the lookupIndex method automatically adds the value if it is not there yet
424-
addToFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(val));
424+
setInFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(val));
425425
} else {
426426
//System.out.println("DEBUG: number, growStopped");
427427
}
@@ -433,21 +433,21 @@ private static void extractFeatureWorker(
433433
//System.out.println("DEBUG: other, mv, setProp");
434434
break;
435435
case keep:
436-
addToFeatureVector(fv, internalFeatureNamePrefix, Double.NaN);
436+
setInFeatureVector(fv, internalFeatureNamePrefix, Double.NaN);
437437
break;
438438
case zero_value: // use the "special_value"
439-
addToFeatureVector(fv, internalFeatureNamePrefix, 0.0);
439+
setInFeatureVector(fv, internalFeatureNamePrefix, 0.0);
440440
String val = MVVALUE;
441441
if (alphabet.contains(val)) {
442-
addToFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(MVVALUE));
442+
setInFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(MVVALUE));
443443
} else if (!alphabet.growthStopped()) {
444-
addToFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(MVVALUE));
444+
setInFeatureVector(fv, internalFeatureNamePrefix, alphabet.lookupIndex(MVVALUE));
445445
} else {
446446
//System.out.println("DEBUG: number, growStopped");
447447
}
448448
break;
449449
case special_value: // we use the special value -1.0 which should get handled by Mallet somehow
450-
addToFeatureVector(fv, internalFeatureNamePrefix, -1.0);
450+
setInFeatureVector(fv, internalFeatureNamePrefix, -1.0);
451451
break;
452452
default:
453453
throw new NotImplementedException("MV-Handling");
@@ -462,27 +462,27 @@ private static void extractFeatureWorker(
462462
double val = 0.0;
463463
if (valObj instanceof Number) {
464464
val = ((Number) valObj).doubleValue();
465-
addToFeatureVector(fv, internalFeatureNamePrefix, val);
465+
setInFeatureVector(fv, internalFeatureNamePrefix, val);
466466
} else if (valObj instanceof Boolean) {
467467
if ((Boolean) valObj) {
468468
val = 1.0;
469469
} else {
470470
val = 0.0;
471471
}
472-
addToFeatureVector(fv, internalFeatureNamePrefix, val);
472+
setInFeatureVector(fv, internalFeatureNamePrefix, val);
473473
} else if (valObj instanceof double[]) {
474474
// create one feature for each entry in the array
475475
int i = 0;
476476
for (double el : ((double[]) valObj)) {
477-
addToFeatureVector(fv, internalFeatureNamePrefix + ELEMSEP + i, el);
477+
setInFeatureVector(fv, internalFeatureNamePrefix + ELEMSEP + i, el);
478478
i++;
479479
}
480480
} else if (valObj instanceof Iterable) {
481481
int i = 0;
482482
for (Object el : (Iterable) valObj) {
483483
val = LFUtils.anyToDoubleOrElse(el, 0.0);
484484
if (val != 0.0) {
485-
addToFeatureVector(fv, internalFeatureNamePrefix + ELEMSEP + i, LFUtils.anyToDoubleOrElse(el, val));
485+
setInFeatureVector(fv, internalFeatureNamePrefix + ELEMSEP + i, LFUtils.anyToDoubleOrElse(el, val));
486486
}
487487
i++;
488488
}
@@ -497,7 +497,7 @@ private static void extractFeatureWorker(
497497
+ // take it from the annotation, annType can be empty!
498498
" at offset " + gate.Utils.start(sourceAnnotation) + " in document " + doc.getName());
499499
}
500-
addToFeatureVector(fv, internalFeatureNamePrefix, val);
500+
setInFeatureVector(fv, internalFeatureNamePrefix, val);
501501
}
502502
//System.err.println("DEBUG: for fname="+featureName+",dt="+dt+", valObj="+valObj+", fv="+fv.numLocations());
503503
} else {
@@ -508,14 +508,14 @@ private static void extractFeatureWorker(
508508
//System.out.println("DEBUG: numeric, mv, setProp");
509509
break;
510510
case keep: // for this kind of codeas, we use the value NaN
511-
addToFeatureVector(fv, internalFeatureNamePrefix, Double.NaN);
511+
setInFeatureVector(fv, internalFeatureNamePrefix, Double.NaN);
512512
break;
513513
case zero_value: // use the first value, does not make much sense really, but ...
514514
// TODO: document that this combination should be avoided, probably
515-
addToFeatureVector(fv, internalFeatureNamePrefix, 0.0);
515+
setInFeatureVector(fv, internalFeatureNamePrefix, 0.0);
516516
break;
517517
case special_value: // we use the special value -1.0 which should get handled by Mallet somehow
518-
addToFeatureVector(fv, internalFeatureNamePrefix, -1.0);
518+
setInFeatureVector(fv, internalFeatureNamePrefix, -1.0);
519519
break;
520520
default:
521521
throw new NotImplementedException("MV-Handling");
@@ -547,7 +547,7 @@ private static void extractFeatureWorker(
547547
" at offset " + gate.Utils.start(sourceAnnotation) + " in document " + doc.getName());
548548
}
549549
}
550-
addToFeatureVector(fv, internalFeatureNamePrefix, val);
550+
setInFeatureVector(fv, internalFeatureNamePrefix, val);
551551
} else {
552552
// we have a missing boolean value
553553
switch (mvt) {
@@ -556,14 +556,14 @@ private static void extractFeatureWorker(
556556
inst.setProperty(PROP_IGNORE_HAS_MV, true);
557557
break;
558558
case keep: // for this kind of codeas, we use the value NaN
559-
addToFeatureVector(fv, internalFeatureNamePrefix, Double.NaN);
559+
setInFeatureVector(fv, internalFeatureNamePrefix, Double.NaN);
560560
break;
561561
case zero_value: // Use zero which will make false identical to missing
562562
// and work well with sparse vectors
563-
addToFeatureVector(fv, internalFeatureNamePrefix, 0.0);
563+
setInFeatureVector(fv, internalFeatureNamePrefix, 0.0);
564564
break;
565565
case special_value: // for a missing boolean we use the special value 0.5 which should get handled by Mallet somehow
566-
addToFeatureVector(fv, internalFeatureNamePrefix, 0.5);
566+
setInFeatureVector(fv, internalFeatureNamePrefix, 0.5);
567567
break;
568568
default:
569569
throw new NotImplementedException("MV-Handling");
@@ -693,15 +693,20 @@ private static void extractFeature(
693693
prefix = ng.name;
694694
}
695695
prefix = prefix + NAMESEP + "N" + number;
696-
// NOTE: if we have a featureName4Value set, then we set the feature value to
697-
// what we have calculated, otherwise we add one for each time the ngram occurs
698-
// within the span.
699-
if (featureName4Value.isEmpty()) {
700-
addToFeatureVector(fv, prefix + VALSEP + ngram, score);
701-
} else {
702-
setFeatureVector(fv, prefix + VALSEP + ngram, score);
703-
}
696+
// NOTE: for now, we always add to any existing value of the feature vector we
697+
// may already have. That way, if some ngram occurs multiple times, we use the
698+
sum of its scores (and the score either is just 1.0 or whatever we got from using
699+
// the featureName4Value value).
700+
accumulateInFeatureVector(fv, prefix + VALSEP + ngram, score);
701+
// NOTE: previously, we only accumulated if there was no weight feature, otherwise
702+
// the weight was directly used without accumulation
703+
//if (featureName4Value.isEmpty()) {
704+
// accumulateInFeatureVector(fv, prefix + VALSEP + ngram, score);
705+
//} else {
706+
// setInFeatureVector(fv, prefix + VALSEP + ngram, score);
707+
//}
704708
}
709+
//System.err.println("DEBUG: Vector after adding feature "+ng+" is now "+fv);
705710
} // extractFeature(NGram)
706711

707712
private static void extractFeature(
@@ -1079,28 +1084,44 @@ public static FeatureSpecAttribute lookupAttributeForFeatureName(List<FeatureSpe
10791084
/// HELPER AND UTILITY METHODS
10801085
///=======================================
10811086
/**
1082-
* Same inputAS the method, but makes sure a non-growable Alphabet is considered.
1087+
* Set a feature in the feature vector, to the given value.
1088+
* However, if growth is stopped, do not set the feature if the key is not known.
10831089
*
1090+
* This method assumes that the key for this feature vector is only set once, if it
1091+
* is set another time for the same feature vector, any old value is overridden!
1092+
*
10841093
* @param fv
10851094
* @param key
10861095
* @param val
10871096
*/
1088-
private static void addToFeatureVector(AugmentableFeatureVector fv, Object key, double val) {
1097+
private static void setInFeatureVector(AugmentableFeatureVector fv, Object key, double val) {
10891098
Alphabet a = fv.getAlphabet();
10901099
if (!a.contains(key) && a.growthStopped()) {
10911100
//System.err.println("DEBUG: GROWTH STOPPED! key="+key+",a="+a);
10921101
return;
10931102
}
1094-
fv.add(key, val);
1103+
if(fv.contains(key)) {
1104+
System.err.println("LF DEBUG: setting/overriding a value where there is already one! key="+key);
1105+
fv.setValue(a.lookupIndex(key), val);
1106+
} else {
1107+
fv.add(key, val);
1108+
}
10951109
}
10961110

1097-
private static void setFeatureVector(AugmentableFeatureVector fv, Object key, double val) {
1111+
private static void accumulateInFeatureVector(AugmentableFeatureVector fv, Object key, double val) {
10981112
Alphabet a = fv.getAlphabet();
10991113
if (!a.contains(key) && a.growthStopped()) {
11001114
return;
11011115
}
1102-
int index = a.lookupIndex(key);
1103-
fv.setValue(index, val);
1116+
fv.add(key,val);
1117+
// Instead of the previous statement the following was used for debugging:
1118+
//if(fv.contains(key)) {
1119+
// fv.add(key,val);
1120+
//System.err.println("DEBUG accumulate: adding to existing: key="+key+" index="+a.lookupIndex(key)+" loc="+fv.location(a.lookupIndex(key)));
1121+
//} else {
1122+
// fv.add(key,val);
1123+
//System.err.println("DEBUG accumulate: creating new: key="+key+" index="+a.lookupIndex(key)+" loc="+fv.location(a.lookupIndex(key)));
1124+
//}
11041125
}
11051126

11061127
}

src/gate/plugin/learningframework/features/FeatureSpecNgram.java

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ public String toString() {
5656
return "NgramAttribute(name="+name+
5757
",type="+annType+
5858
",feature="+feature+
59+
",featureName4Value="+featureName4Value+
5960
",number="+number;
6061
}
6162

0 commit comments

Comments
 (0)