From cb1688a9c6ea260ec088a38022cd2fb0f8d7673c Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Mon, 29 Sep 2025 13:42:06 +0400 Subject: [PATCH] Add p-value heuristics to significant terms aggregation (#5353) (cherry picked from commit 2cddbefb265a4cffdcff8c2b6c29561429d8a499) # Conflicts: # output/schema/schema.json --- output/openapi/elasticsearch-openapi.json | 16 ++++ .../elasticsearch-serverless-openapi.json | 16 ++++ output/schema/schema.json | 80 +++++++++++++++---- output/typescript/types.ts | 6 ++ specification/_types/aggregations/bucket.ts | 26 ++++++ 5 files changed, 127 insertions(+), 17 deletions(-) diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index 0412e03bbf..5d5e8b0a04 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -62600,6 +62600,9 @@ "script_heuristic": { "$ref": "#/components/schemas/_types.aggregations.ScriptedHeuristic" }, + "p_value": { + "$ref": "#/components/schemas/_types.aggregations.PValueHeuristic" + }, "shard_min_doc_count": { "description": "Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.\nTerms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.", "type": "number" @@ -62678,6 +62681,19 @@ "script" ] }, + "_types.aggregations.PValueHeuristic": { + "type": "object", + "properties": { + "background_is_superset": { + "type": "boolean" + }, + "normalize_above": { + "description": "Should the results be normalized when above the given value.\nAllows for consistent significance results at various scales.\nNote: `0` is a special value which means no normalization", + "default": 0.0, + "type": "number" + } + } + }, "_types.aggregations.SignificantTextAggregation": { "allOf": [ { diff --git a/output/openapi/elasticsearch-serverless-openapi.json 
b/output/openapi/elasticsearch-serverless-openapi.json index 83ebe3c7c6..1c3c0fb25d 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -39396,6 +39396,9 @@ "script_heuristic": { "$ref": "#/components/schemas/_types.aggregations.ScriptedHeuristic" }, + "p_value": { + "$ref": "#/components/schemas/_types.aggregations.PValueHeuristic" + }, "shard_min_doc_count": { "description": "Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.\nTerms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.", "type": "number" @@ -39474,6 +39477,19 @@ "script" ] }, + "_types.aggregations.PValueHeuristic": { + "type": "object", + "properties": { + "background_is_superset": { + "type": "boolean" + }, + "normalize_above": { + "description": "Should the results be normalized when above the given value.\nAllows for consistent significance results at various scales.\nNote: `0` is a special value which means no normalization", + "default": 0.0, + "type": "number" + } + } + }, "_types.aggregations.SignificantTextAggregation": { "allOf": [ { diff --git a/output/schema/schema.json b/output/schema/schema.json index 0186b835b8..9a76f3ce98 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -59083,7 +59083,7 @@ "name": "AggregateOrder", "namespace": "_types.aggregations" }, - "specLocation": "_types/aggregations/bucket.ts#L1052-L1054", + "specLocation": "_types/aggregations/bucket.ts#L1078-L1080", "type": { "kind": "union_of", "items": [ @@ -61676,7 +61676,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1117-L1183" + "specLocation": "_types/aggregations/bucket.ts#L1143-L1209" }, { "kind": "type_alias", @@ -61688,7 +61688,7 @@ "name": "CategorizeTextAnalyzer", "namespace": "_types.aggregations" }, - "specLocation": 
"_types/aggregations/bucket.ts#L1185-L1188", + "specLocation": "_types/aggregations/bucket.ts#L1211-L1214", "type": { "kind": "union_of", "items": [ @@ -62364,7 +62364,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1190-L1194" + "specLocation": "_types/aggregations/bucket.ts#L1216-L1220" }, { "kind": "interface", @@ -63789,7 +63789,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1241-L1268" + "specLocation": "_types/aggregations/bucket.ts#L1267-L1294" }, { "kind": "interface", @@ -63889,7 +63889,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1227-L1239" + "specLocation": "_types/aggregations/bucket.ts#L1253-L1265" }, { "kind": "enum", @@ -65671,7 +65671,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1196-L1225" + "specLocation": "_types/aggregations/bucket.ts#L1222-L1251" }, { "kind": "interface", @@ -67415,6 +67415,40 @@ }, "specLocation": "_types/aggregations/pipeline.ts#L361-L387" }, + { + "kind": "interface", + "name": { + "name": "PValueHeuristic", + "namespace": "_types.aggregations" + }, + "properties": [ + { + "name": "background_is_superset", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + }, + { + "description": "Should the results be normalized when above the given value.\nAllows for consistent significance results at various scales.\nNote: `0` is a special value which means no normalization", + "name": "normalize_above", + "required": false, + "serverDefault": 0, + "type": { + "kind": "instance_of", + "type": { + "name": "long", + "namespace": "_types" + } + } + } + ], + "specLocation": "_types/aggregations/bucket.ts#L817-L831" + }, { "kind": "interface", "attachedBehaviors": [ @@ -68846,6 +68880,18 @@ } } }, + { + "description": "Significant terms heuristic that calculates the p-value between the term existing in foreground and background sets.\n\nThe p-value is the probability of obtaining test results at least as 
extreme as\nthe results actually observed, under the assumption that the null hypothesis is\ncorrect. The p-value is calculated assuming that the foreground set and the\nbackground set are independent https://en.wikipedia.org/wiki/Bernoulli_trial, with the null\nhypothesis that the probabilities are the same.", + "name": "p_value", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "PValueHeuristic", + "namespace": "_types.aggregations" + } + } + }, { "description": "Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.\nTerms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.", "name": "shard_min_doc_count", @@ -68883,7 +68929,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L817-L884" + "specLocation": "_types/aggregations/bucket.ts#L833-L910" }, { "kind": "interface", @@ -69147,7 +69193,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L886-L961" + "specLocation": "_types/aggregations/bucket.ts#L912-L987" }, { "kind": "interface", @@ -70569,7 +70615,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L963-L1031" + "specLocation": "_types/aggregations/bucket.ts#L989-L1057" }, { "kind": "enum", @@ -70587,7 +70633,7 @@ "name": "TermsAggregationCollectMode", "namespace": "_types.aggregations" }, - "specLocation": "_types/aggregations/bucket.ts#L1056-L1065" + "specLocation": "_types/aggregations/bucket.ts#L1082-L1091" }, { "kind": "enum", @@ -70609,7 +70655,7 @@ "name": "TermsAggregationExecutionHint", "namespace": "_types.aggregations" }, - "specLocation": "_types/aggregations/bucket.ts#L1067-L1072" + "specLocation": "_types/aggregations/bucket.ts#L1093-L1098" }, { "kind": "interface", @@ -70651,7 +70697,7 @@ "name": "TermsExclude", "namespace": "_types.aggregations" }, - "specLocation": "_types/aggregations/bucket.ts#L1077-L1078", + "specLocation": 
"_types/aggregations/bucket.ts#L1103-L1104", "type": { "kind": "union_of", "items": [ @@ -70686,7 +70732,7 @@ "name": "TermsInclude", "namespace": "_types.aggregations" }, - "specLocation": "_types/aggregations/bucket.ts#L1074-L1075", + "specLocation": "_types/aggregations/bucket.ts#L1100-L1101", "type": { "kind": "union_of", "items": [ @@ -70749,7 +70795,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1080-L1089" + "specLocation": "_types/aggregations/bucket.ts#L1106-L1115" }, { "kind": "interface", @@ -70862,7 +70908,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1033-L1046" + "specLocation": "_types/aggregations/bucket.ts#L1059-L1072" }, { "kind": "interface", @@ -71593,7 +71639,7 @@ } } ], - "specLocation": "_types/aggregations/bucket.ts#L1091-L1115" + "specLocation": "_types/aggregations/bucket.ts#L1117-L1141" }, { "kind": "interface", diff --git a/output/typescript/types.ts b/output/typescript/types.ts index cb9d46d7bb..ad6d3c1736 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -4026,6 +4026,11 @@ export interface AggregationsNormalizeAggregation extends AggregationsPipelineAg export type AggregationsNormalizeMethod = 'rescale_0_1' | 'rescale_0_100' | 'percent_of_sum' | 'mean' | 'z-score' | 'softmax' +export interface AggregationsPValueHeuristic { + background_is_superset?: boolean + normalize_above?: long +} + export interface AggregationsParentAggregateKeys extends AggregationsSingleBucketAggregateBase { } export type AggregationsParentAggregate = AggregationsParentAggregateKeys @@ -4198,6 +4203,7 @@ export interface AggregationsSignificantTermsAggregation extends AggregationsBuc mutual_information?: AggregationsMutualInformationHeuristic percentage?: AggregationsPercentageScoreHeuristic script_heuristic?: AggregationsScriptedHeuristic + p_value?: AggregationsPValueHeuristic shard_min_doc_count?: long shard_size?: integer size?: integer diff --git a/specification/_types/aggregations/bucket.ts 
b/specification/_types/aggregations/bucket.ts index fb6f16abd3..9c8141c5d9 100644 --- a/specification/_types/aggregations/bucket.ts +++ b/specification/_types/aggregations/bucket.ts @@ -814,6 +814,22 @@ export class ScriptedHeuristic { script: Script } +export class PValueHeuristic { + /** + * Set to false to indicate that the background set does + * not contain the counts of the foreground set as they are filtered out. + * @server_default true + */ + background_is_superset?: boolean + /** + * Should the results be normalized when above the given value. + * Allows for consistent significance results at various scales. + * Note: `0` is a special value which means no normalization + * @server_default 0 + */ + normalize_above?: long +} + /** * @ext_doc_id search-aggregations-bucket-significanttext-aggregation */ @@ -867,6 +883,16 @@ export class SignificantTermsAggregation extends BucketAggregationBase { * Customized score, implemented via a script. */ script_heuristic?: ScriptedHeuristic + /** + * Significant terms heuristic that calculates the p-value between the term existing in foreground and background sets. + * + * The p-value is the probability of obtaining test results at least as extreme as + * the results actually observed, under the assumption that the null hypothesis is + * correct. The p-value is calculated assuming that the foreground set and the + * background set are independent [Bernoulli trials](https://en.wikipedia.org/wiki/Bernoulli_trial), with the null + * hypothesis that the probabilities are the same. + */ + p_value?: PValueHeuristic /** * Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`. * Terms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.