From cb1688a9c6ea260ec088a38022cd2fb0f8d7673c Mon Sep 17 00:00:00 2001
From: Quentin Pradet <quentin.pradet@elastic.co>
Date: Mon, 29 Sep 2025 13:42:06 +0400
Subject: [PATCH] Add p-value heuristics to significant terms aggregation
 (#5353)

(cherry picked from commit 2cddbefb265a4cffdcff8c2b6c29561429d8a499)

# Conflicts:
#	output/schema/schema.json
---
 output/openapi/elasticsearch-openapi.json     | 16 ++++
 .../elasticsearch-serverless-openapi.json     | 16 ++++
 output/schema/schema.json                     | 80 +++++++++++++++----
 output/typescript/types.ts                    |  6 ++
 specification/_types/aggregations/bucket.ts   | 26 ++++++
 5 files changed, 127 insertions(+), 17 deletions(-)

diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
index 0412e03bbf..5d5e8b0a04 100644
--- a/output/openapi/elasticsearch-openapi.json
+++ b/output/openapi/elasticsearch-openapi.json
@@ -62600,6 +62600,9 @@
               "script_heuristic": {
                 "$ref": "#/components/schemas/_types.aggregations.ScriptedHeuristic"
               },
+              "p_value": {
+                "$ref": "#/components/schemas/_types.aggregations.PValueHeuristic"
+              },
               "shard_min_doc_count": {
                 "description": "Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.\nTerms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.",
                 "type": "number"
@@ -62678,6 +62681,19 @@
           "script"
         ]
       },
+      "_types.aggregations.PValueHeuristic": {
+        "type": "object",
+        "properties": {
+          "background_is_superset": {
+            "type": "boolean"
+          },
+          "normalize_above": {
+            "description": "Should the results be normalized when above the given value.\nAllows for consistent significance results at various scales.\nNote: `0` is a special value which means no normalization",
+            "default": 0.0,
+            "type": "number"
+          }
+        }
+      },
       "_types.aggregations.SignificantTextAggregation": {
         "allOf": [
           {
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
index 83ebe3c7c6..1c3c0fb25d 100644
--- a/output/openapi/elasticsearch-serverless-openapi.json
+++ b/output/openapi/elasticsearch-serverless-openapi.json
@@ -39396,6 +39396,9 @@
               "script_heuristic": {
                 "$ref": "#/components/schemas/_types.aggregations.ScriptedHeuristic"
               },
+              "p_value": {
+                "$ref": "#/components/schemas/_types.aggregations.PValueHeuristic"
+              },
               "shard_min_doc_count": {
                 "description": "Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.\nTerms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.",
                 "type": "number"
@@ -39474,6 +39477,19 @@
           "script"
         ]
       },
+      "_types.aggregations.PValueHeuristic": {
+        "type": "object",
+        "properties": {
+          "background_is_superset": {
+            "type": "boolean"
+          },
+          "normalize_above": {
+            "description": "Should the results be normalized when above the given value.\nAllows for consistent significance results at various scales.\nNote: `0` is a special value which means no normalization",
+            "default": 0.0,
+            "type": "number"
+          }
+        }
+      },
       "_types.aggregations.SignificantTextAggregation": {
         "allOf": [
           {
diff --git a/output/schema/schema.json b/output/schema/schema.json
index 0186b835b8..9a76f3ce98 100644
--- a/output/schema/schema.json
+++ b/output/schema/schema.json
@@ -59083,7 +59083,7 @@
         "name": "AggregateOrder",
         "namespace": "_types.aggregations"
       },
-      "specLocation": "_types/aggregations/bucket.ts#L1052-L1054",
+      "specLocation": "_types/aggregations/bucket.ts#L1078-L1080",
       "type": {
         "kind": "union_of",
         "items": [
@@ -61676,7 +61676,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1117-L1183"
+      "specLocation": "_types/aggregations/bucket.ts#L1143-L1209"
     },
     {
       "kind": "type_alias",
@@ -61688,7 +61688,7 @@
         "name": "CategorizeTextAnalyzer",
         "namespace": "_types.aggregations"
       },
-      "specLocation": "_types/aggregations/bucket.ts#L1185-L1188",
+      "specLocation": "_types/aggregations/bucket.ts#L1211-L1214",
       "type": {
         "kind": "union_of",
         "items": [
@@ -62364,7 +62364,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1190-L1194"
+      "specLocation": "_types/aggregations/bucket.ts#L1216-L1220"
     },
     {
       "kind": "interface",
@@ -63789,7 +63789,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1241-L1268"
+      "specLocation": "_types/aggregations/bucket.ts#L1267-L1294"
     },
     {
       "kind": "interface",
@@ -63889,7 +63889,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1227-L1239"
+      "specLocation": "_types/aggregations/bucket.ts#L1253-L1265"
     },
     {
       "kind": "enum",
@@ -65671,7 +65671,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1196-L1225"
+      "specLocation": "_types/aggregations/bucket.ts#L1222-L1251"
     },
     {
       "kind": "interface",
@@ -67415,6 +67415,40 @@
       },
       "specLocation": "_types/aggregations/pipeline.ts#L361-L387"
     },
+    {
+      "kind": "interface",
+      "name": {
+        "name": "PValueHeuristic",
+        "namespace": "_types.aggregations"
+      },
+      "properties": [
+        {
+          "name": "background_is_superset",
+          "required": false,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "boolean",
+              "namespace": "_builtins"
+            }
+          }
+        },
+        {
+          "description": "Should the results be normalized when above the given value.\nAllows for consistent significance results at various scales.\nNote: `0` is a special value which means no normalization",
+          "name": "normalize_above",
+          "required": false,
+          "serverDefault": 0,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "long",
+              "namespace": "_types"
+            }
+          }
+        }
+      ],
+      "specLocation": "_types/aggregations/bucket.ts#L817-L831"
+    },
     {
       "kind": "interface",
       "attachedBehaviors": [
@@ -68846,6 +68880,18 @@
             }
           }
         },
+        {
+          "description": "Significant terms heuristic that calculates the p-value between the term existing in foreground and background sets.\n\nThe p-value is the probability of obtaining test results at least as extreme as\nthe results actually observed, under the assumption that the null hypothesis is\ncorrect. The p-value is calculated assuming that the foreground set and the\nbackground set are independent https://en.wikipedia.org/wiki/Bernoulli_trial, with the null\nhypothesis that the probabilities are the same.",
+          "name": "p_value",
+          "required": false,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "PValueHeuristic",
+              "namespace": "_types.aggregations"
+            }
+          }
+        },
         {
           "description": "Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.\nTerms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.",
           "name": "shard_min_doc_count",
@@ -68883,7 +68929,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L817-L884"
+      "specLocation": "_types/aggregations/bucket.ts#L833-L910"
     },
     {
       "kind": "interface",
@@ -69147,7 +69193,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L886-L961"
+      "specLocation": "_types/aggregations/bucket.ts#L912-L987"
     },
     {
       "kind": "interface",
@@ -70569,7 +70615,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L963-L1031"
+      "specLocation": "_types/aggregations/bucket.ts#L989-L1057"
     },
     {
       "kind": "enum",
@@ -70587,7 +70633,7 @@
         "name": "TermsAggregationCollectMode",
         "namespace": "_types.aggregations"
       },
-      "specLocation": "_types/aggregations/bucket.ts#L1056-L1065"
+      "specLocation": "_types/aggregations/bucket.ts#L1082-L1091"
     },
     {
       "kind": "enum",
@@ -70609,7 +70655,7 @@
         "name": "TermsAggregationExecutionHint",
         "namespace": "_types.aggregations"
       },
-      "specLocation": "_types/aggregations/bucket.ts#L1067-L1072"
+      "specLocation": "_types/aggregations/bucket.ts#L1093-L1098"
     },
     {
       "kind": "interface",
@@ -70651,7 +70697,7 @@
         "name": "TermsExclude",
         "namespace": "_types.aggregations"
       },
-      "specLocation": "_types/aggregations/bucket.ts#L1077-L1078",
+      "specLocation": "_types/aggregations/bucket.ts#L1103-L1104",
       "type": {
         "kind": "union_of",
         "items": [
@@ -70686,7 +70732,7 @@
         "name": "TermsInclude",
         "namespace": "_types.aggregations"
       },
-      "specLocation": "_types/aggregations/bucket.ts#L1074-L1075",
+      "specLocation": "_types/aggregations/bucket.ts#L1100-L1101",
       "type": {
         "kind": "union_of",
         "items": [
@@ -70749,7 +70795,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1080-L1089"
+      "specLocation": "_types/aggregations/bucket.ts#L1106-L1115"
     },
     {
       "kind": "interface",
@@ -70862,7 +70908,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1033-L1046"
+      "specLocation": "_types/aggregations/bucket.ts#L1059-L1072"
     },
     {
       "kind": "interface",
@@ -71593,7 +71639,7 @@
           }
         }
       ],
-      "specLocation": "_types/aggregations/bucket.ts#L1091-L1115"
+      "specLocation": "_types/aggregations/bucket.ts#L1117-L1141"
     },
     {
       "kind": "interface",
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
index cb9d46d7bb..ad6d3c1736 100644
--- a/output/typescript/types.ts
+++ b/output/typescript/types.ts
@@ -4026,6 +4026,11 @@ export interface AggregationsNormalizeAggregation extends AggregationsPipelineAg
 
 export type AggregationsNormalizeMethod = 'rescale_0_1' | 'rescale_0_100' | 'percent_of_sum' | 'mean' | 'z-score' | 'softmax'
 
+export interface AggregationsPValueHeuristic {
+  background_is_superset?: boolean
+  normalize_above?: long
+}
+
 export interface AggregationsParentAggregateKeys extends AggregationsSingleBucketAggregateBase {
 }
 export type AggregationsParentAggregate = AggregationsParentAggregateKeys
@@ -4198,6 +4203,7 @@ export interface AggregationsSignificantTermsAggregation extends AggregationsBuc
   mutual_information?: AggregationsMutualInformationHeuristic
   percentage?: AggregationsPercentageScoreHeuristic
   script_heuristic?: AggregationsScriptedHeuristic
+  p_value?: AggregationsPValueHeuristic
   shard_min_doc_count?: long
   shard_size?: integer
   size?: integer
diff --git a/specification/_types/aggregations/bucket.ts b/specification/_types/aggregations/bucket.ts
index fb6f16abd3..9c8141c5d9 100644
--- a/specification/_types/aggregations/bucket.ts
+++ b/specification/_types/aggregations/bucket.ts
@@ -814,6 +814,22 @@ export class ScriptedHeuristic {
   script: Script
 }
 
+export class PValueHeuristic {
+  /**
+   * Set to false to indicate that the background set does
+   * not contain the counts of the foreground set as they are filtered out.
+   * @server_default true
+   */
+  background_is_superset?: boolean
+  /**
+   * Should the results be normalized when above the given value.
+   * Allows for consistent significance results at various scales.
+   * Note: `0` is a special value which means no normalization
+   * @server_default 0
+   */
+  normalize_above?: long
+}
+
 /**
  * @ext_doc_id search-aggregations-bucket-significanttext-aggregation
  */
@@ -867,6 +883,16 @@ export class SignificantTermsAggregation extends BucketAggregationBase {
    * Customized score, implemented via a script.
    */
   script_heuristic?: ScriptedHeuristic
+  /**
+   * Significant terms heuristic that calculates the p-value between the term existing in foreground and background sets.
+   *
+   * The p-value is the probability of obtaining test results at least as extreme as
+   * the results actually observed, under the assumption that the null hypothesis is
+   * correct. The p-value is calculated assuming that the foreground set and the
+   * background set are independent [Bernoulli trials](https://en.wikipedia.org/wiki/Bernoulli_trial), with the null
+   * hypothesis that the probabilities are the same.
+   */
+  p_value?: PValueHeuristic
   /**
    * Regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`.
    * Terms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`.