Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,36 @@ impl Query {
pub(crate) fn get(&self) -> &dyn tv::query::Query {
&self.inner
}

/// Internal helper shared by the BooleanQuery convenience methods
/// (and_must_match, or_should_match, and_must_not_match).
///
/// Returns a new `Query` wrapping a `BooleanQuery` that combines `self` with `other`:
///
/// - If `self.inner` is itself a `BooleanQuery`, its existing clauses are cloned
///   with their original occurs and `(other_occur, other)` is appended.
///   `self_occur` is not used on this path.
/// - Otherwise a two-clause `BooleanQuery` is built from `(self_occur, self)`
///   and `(other_occur, other)`.
///
/// `self` is only borrowed and `other` is only read; every sub-query placed in
/// the result is a copy obtained through `box_clone`.
fn combine_with(
&self,
other: Query,
self_occur: tv::query::Occur,
other_occur: tv::query::Occur,
) -> Query {
type BooleanQuery = tv::query::BooleanQuery;

// Only the left-hand side (`self`) is inspected for an existing BooleanQuery;
// `other` is always added as a single clause, even if it is a BooleanQuery too.
let inner: Box<dyn tv::query::Query> = if let Some(boolean_query) =
self.inner.downcast_ref::<BooleanQuery>()
Comment on lines +73 to +74
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently this only checks whether self is a boolean query and will then reuse that to append the new clause; however, it is possible that other is also a boolean query. So a further optimization can be made to either append to other (if a BQ), or to fully merge self and other if they're both BQ.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(some care must be taken when doing the appends, in case other_occur is a MUST NOT)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OTOH we may want to intentionally NOT incorporate optimizations for other, in light of #287 (comment). It may be desired to only pull things into an existing BooleanQuery when it's on the "left hand side".

{
// Flattening path: copy the clauses already present in `self` and append
// the new one, so chained calls grow one clause list instead of nesting.
// NOTE(review): `self_occur` is ignored here, so the existing clauses keep
// whatever occur they already had (e.g. SHOULD clauses stay SHOULD when a
// MUST clause is appended). Presumably intentional builder-style behaviour;
// confirm the resulting match semantics are what callers expect.
let mut subqueries = boolean_query
.clauses()
.iter()
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
.collect::<Vec<_>>();
subqueries.push((other_occur, other.inner.box_clone()));
Box::new(BooleanQuery::new(subqueries))
} else {
// `self` is not a BooleanQuery: wrap both operands in a fresh one.
Box::new(BooleanQuery::new(vec![
(self_occur, self.inner.box_clone()),
(other_occur, other.inner.box_clone()),
]))
};

Query { inner }
}
}

#[pymethods]
Expand Down Expand Up @@ -219,6 +249,45 @@ impl Query {
})
}

/// Combine this query with `query` so that both are required (MUST / MUST).
///
/// When this query is not a BooleanQuery, the result is a new BooleanQuery
/// holding this query and `query` as two MUST clauses. When it already is a
/// BooleanQuery, its existing clauses are kept as they are and `query` is
/// appended as one more MUST clause.
#[pyo3(signature = (query))]
pub(crate) fn and_must_match(&self, query: Query) -> PyResult<Query> {
    // Both operands take the same occur, so name it once.
    let required = tv::query::Occur::Must;
    let conjunction = self.combine_with(query, required, required);
    Ok(conjunction)
}

/// Combine this query with `query` so that `query` is excluded (MUST / MUST NOT).
///
/// When this query is not a BooleanQuery, the result is a new BooleanQuery
/// holding this query as a MUST clause and `query` as a MUST NOT clause. When
/// it already is a BooleanQuery, its existing clauses are kept as they are and
/// `query` is appended as one more MUST NOT clause.
#[pyo3(signature = (query))]
pub(crate) fn and_must_not_match(&self, query: Query) -> PyResult<Query> {
    let required = tv::query::Occur::Must;
    let excluded = tv::query::Occur::MustNot;
    let filtered = self.combine_with(query, required, excluded);
    Ok(filtered)
}

/// Combine this query with `query` as alternatives (SHOULD / SHOULD).
///
/// When this query is not a BooleanQuery, the result is a new BooleanQuery
/// holding this query and `query` as two SHOULD clauses. When it already is a
/// BooleanQuery, its existing clauses are kept as they are and `query` is
/// appended as one more SHOULD clause.
#[pyo3(signature = (query))]
pub(crate) fn or_should_match(&self, query: Query) -> PyResult<Query> {
    // Both operands take the same occur, so name it once.
    let optional = tv::query::Occur::Should;
    let disjunction = self.combine_with(query, optional, optional);
    Ok(disjunction)
}

/// Construct a Tantivy's DisjunctionMaxQuery
#[staticmethod]
#[pyo3(signature = (subqueries, tie_breaker=None))]
Expand Down
9 changes: 9 additions & 0 deletions tantivy/tantivy.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,15 @@ class Query:
def boolean_query(subqueries: Sequence[tuple[Occur, Query]]) -> Query:
pass

def and_must_match(self, query: Query) -> Query:
    """Return a new boolean query in which ``query`` is a MUST clause.

    If this query is not already a boolean query, it is wrapped together
    with ``query`` as two MUST clauses. If it is, ``query`` is appended to
    its existing clauses as an additional MUST clause.
    """
    pass

def and_must_not_match(self, query: Query) -> Query:
    """Return a new boolean query in which ``query`` is a MUST NOT clause.

    If this query is not already a boolean query, it becomes a MUST clause
    and ``query`` a MUST NOT clause. If it is, ``query`` is appended to its
    existing clauses as an additional MUST NOT clause.
    """
    pass

def or_should_match(self, query: Query) -> Query:
    """Return a new boolean query in which ``query`` is a SHOULD clause.

    If this query is not already a boolean query, it is wrapped together
    with ``query`` as two SHOULD clauses. If it is, ``query`` is appended to
    its existing clauses as an additional SHOULD clause.
    """
    pass

@staticmethod
def disjunction_max_query(
subqueries: Sequence[Query], tie_breaker: Optional[float] = None
Expand Down
62 changes: 62 additions & 0 deletions tests/tantivy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1129,6 +1129,68 @@ def test_boolean_query(self, ram_index):
]
)

def test_boolean_query_helpers(self, ram_index: tantivy.Index):
    """Check and_must_match, or_should_match and and_must_not_match end to end."""
    index = ram_index
    searcher = index.searcher()

    def title_term(word):
        # Term query against the "title" field of the fixture index.
        return Query.term_query(index.schema, "title", word)

    def run(query):
        # Every check below looks at the top 10 hits.
        return searcher.search(query, 10)

    def hit_titles(result):
        # First "title" value of each hit, as a set.
        return {searcher.doc(hit[1])["title"][0] for hit in result.hits}

    def assert_only_old_man_and_the_sea(result):
        assert len(result.hits) == 1
        searched_doc = searcher.doc(result.hits[0][1])
        assert searched_doc["title"] == ["The Old Man and the Sea"]

    # Building blocks. Per the fixture data, "sea", "old" and "man" appear in
    # the title "The Old Man and the Sea", and "mice" in "Of Mice and Men".
    query_sea = title_term("sea")
    query_mice = title_term("mice")
    query_old = title_term("old")
    query_man = title_term("man")

    # --- and_must_match ---
    # No title contains both "sea" and "mice".
    assert len(run(query_sea.and_must_match(query_mice)).hits) == 0

    # Exactly one title contains both "old" and "man".
    assert_only_old_man_and_the_sea(run(query_old.and_must_match(query_man)))

    # Same expectation after chaining the same MUST clause eleven times.
    chained = query_old
    for _ in range(11):
        chained = chained.and_must_match(query_man)
    assert_only_old_man_and_the_sea(run(chained))

    # --- or_should_match ---
    # Either "sea" or "mice" in the title is enough.
    either = run(query_sea.or_should_match(query_mice))
    assert len(either.hits) == 2
    titles = hit_titles(either)
    assert "The Old Man and the Sea" in titles
    assert "Of Mice and Men" in titles

    # --- and_must_not_match ---
    # All 3 docs contain "and" in the body; drop the one with "sea" in the title.
    query_and_body = Query.term_query(index.schema, "body", "and")
    without_sea = run(query_and_body.and_must_not_match(query_sea))
    assert len(without_sea.hits) == 2
    titles = hit_titles(without_sea)
    assert "The Old Man and the Sea" not in titles

def test_disjunction_max_query(self, ram_index):
index = ram_index

Expand Down
Loading