From 4f74c56de96d804f420544b543ded42dc4f76692 Mon Sep 17 00:00:00 2001
From: Caleb Hattingh
Date: Sat, 6 Sep 2025 05:35:59 +0200
Subject: [PATCH] Helper methods for building boolean queries

---
 src/query.rs          | 69 +++++++++++++++++++++++++++++++++++++++++++
 tantivy/tantivy.pyi   |  9 ++++++
 tests/tantivy_test.py | 62 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+)

diff --git a/src/query.rs b/src/query.rs
index dc3acb61..ac90022b 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -59,6 +59,36 @@ impl Query {
     pub(crate) fn get(&self) -> &dyn tv::query::Query {
         &self.inner
     }
+
+    /// This is an internal helper method for the BooleanQuery
+    /// convenience methods (and_must_match, or_should_match, and_must_not_match).
+    fn combine_with(
+        &self,
+        other: Query,
+        self_occur: tv::query::Occur,
+        other_occur: tv::query::Occur,
+    ) -> Query {
+        type BooleanQuery = tv::query::BooleanQuery;
+
+        let inner: Box<dyn tv::query::Query> = if let Some(boolean_query) =
+            self.inner.downcast_ref::<BooleanQuery>()
+        {
+            let mut subqueries = boolean_query
+                .clauses()
+                .iter()
+                .map(|(occur, subquery)| (*occur, subquery.box_clone()))
+                .collect::<Vec<_>>();
+            subqueries.push((other_occur, other.inner.box_clone()));
+            Box::new(BooleanQuery::new(subqueries))
+        } else {
+            Box::new(BooleanQuery::new(vec![
+                (self_occur, self.inner.box_clone()),
+                (other_occur, other.inner.box_clone()),
+            ]))
+        };
+
+        Query { inner }
+    }
 }
 
 #[pymethods]
@@ -219,6 +249,45 @@ impl Query {
         })
     }
 
+    /// Convenience method to combine two queries with AND (MUST) logic.
+    /// If the current query is already a BooleanQuery, it adds the new query
+    /// as an additional MUST clause. Otherwise, it creates a new BooleanQuery
+    /// with both the current and new queries as MUST clauses.
+    #[pyo3(signature = (query))]
+    pub(crate) fn and_must_match(&self, query: Query) -> PyResult<Query> {
+        Ok(self.combine_with(
+            query,
+            tv::query::Occur::Must,
+            tv::query::Occur::Must,
+        ))
+    }
+
+    /// Convenience method to combine two queries with AND (MUST NOT) logic.
+    /// If the current query is already a BooleanQuery, it adds the new query
+    /// as an additional MUST NOT clause. Otherwise, it creates a new BooleanQuery
+    /// with the current query as a MUST clause and the new query as a MUST NOT clause.
+    #[pyo3(signature = (query))]
+    pub(crate) fn and_must_not_match(&self, query: Query) -> PyResult<Query> {
+        Ok(self.combine_with(
+            query,
+            tv::query::Occur::Must,
+            tv::query::Occur::MustNot,
+        ))
+    }
+
+    /// Convenience method to combine two queries with OR (SHOULD) logic.
+    /// If the current query is already a BooleanQuery, it adds the new query
+    /// as an additional SHOULD clause. Otherwise, it creates a new BooleanQuery
+    /// with both the current and new queries as SHOULD clauses.
+    #[pyo3(signature = (query))]
+    pub(crate) fn or_should_match(&self, query: Query) -> PyResult<Query> {
+        Ok(self.combine_with(
+            query,
+            tv::query::Occur::Should,
+            tv::query::Occur::Should,
+        ))
+    }
+
     /// Construct a Tantivy's DisjunctionMaxQuery
     #[staticmethod]
     #[pyo3(signature = (subqueries, tie_breaker=None))]
diff --git a/tantivy/tantivy.pyi b/tantivy/tantivy.pyi
index 15a6b386..b5660d00 100644
--- a/tantivy/tantivy.pyi
+++ b/tantivy/tantivy.pyi
@@ -264,6 +264,15 @@ class Query:
     def boolean_query(subqueries: Sequence[tuple[Occur, Query]]) -> Query:
         pass
 
+    def and_must_match(self, query: Query) -> Query:
+        pass
+
+    def and_must_not_match(self, query: Query) -> Query:
+        pass
+
+    def or_should_match(self, query: Query) -> Query:
+        pass
+
     @staticmethod
     def disjunction_max_query(
         subqueries: Sequence[Query], tie_breaker: Optional[float] = None
diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py
index 8629a5dc..26da8f12 100644
--- a/tests/tantivy_test.py
+++ b/tests/tantivy_test.py
@@ -1129,6 +1129,68 @@ def test_boolean_query(self, ram_index):
             ]
         )
 
+    def test_boolean_query_helpers(self, ram_index: tantivy.Index):
+        index = ram_index
+        searcher = index.searcher()
+
+        # Queries for testing
+        query_sea = Query.term_query(index.schema, "title", "sea")  # Matches "The Old Man and the Sea"
+        query_mice = Query.term_query(index.schema, "title", "mice")  # Matches "Of Mice and Men"
+        query_old = Query.term_query(index.schema, "title", "old")  # Matches "The Old Man and the Sea"
+        query_man = Query.term_query(index.schema, "title", "man")  # Matches "The Old Man and the Sea"
+
+        # Test and_must_match
+        # No document contains both "sea" and "mice" in the title
+        combined_must = query_sea.and_must_match(query_mice)
+        result = searcher.search(combined_must, 10)
+        assert len(result.hits) == 0
+
+        # "The Old Man and the Sea" contains both "old" and "man"
+        combined_must = query_old.and_must_match(query_man)
+        result = searcher.search(combined_must, 10)
+        assert len(result.hits) == 1
+        searched_doc = searcher.doc(result.hits[0][1])
+        assert searched_doc["title"] == ["The Old Man and the Sea"]
+
+        # "The Old Man and the Sea" contains both "old" and "man"
+        # (but with many chains)
+        combined_must = (
+            query_old
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+            .and_must_match(query_man)
+        )
+        result = searcher.search(combined_must, 10)
+        assert len(result.hits) == 1
+        searched_doc = searcher.doc(result.hits[0][1])
+        assert searched_doc["title"] == ["The Old Man and the Sea"]
+
+        # Test or_should_match
+        # Should match documents containing either "sea" or "mice"
+        combined_should = query_sea.or_should_match(query_mice)
+        result = searcher.search(combined_should, 10)
+        assert len(result.hits) == 2
+        titles = {searcher.doc(hit[1])["title"][0] for hit in result.hits}
+        assert "The Old Man and the Sea" in titles
+        assert "Of Mice and Men" in titles
+
+        # Test and_must_not_match
+        # All 3 docs contain "and" in the body. We exclude the one with "sea" in the title.
+        query_and_body = Query.term_query(index.schema, "body", "and")
+        combined_must_not = query_and_body.and_must_not_match(query_sea)
+        result = searcher.search(combined_must_not, 10)
+        assert len(result.hits) == 2
+        titles = {searcher.doc(hit[1])["title"][0] for hit in result.hits}
+        assert "The Old Man and the Sea" not in titles
+
     def test_disjunction_max_query(self, ram_index):
         index = ram_index