-
-
Notifications
You must be signed in to change notification settings - Fork 421
perf: Optimize searching tags with DB indexes #1129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
ea288e9
1d7a267
8ce773b
a1dd8d4
30d9403
06dc5e8
9ff3364
dba9e24
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,7 +75,6 @@ | |
| DB_VERSION_LEGACY_KEY, | ||
| JSON_FILENAME, | ||
| SQL_FILENAME, | ||
| TAG_CHILDREN_QUERY, | ||
| ) | ||
| from tagstudio.core.library.alchemy.db import make_tables | ||
| from tagstudio.core.library.alchemy.enums import ( | ||
|
|
@@ -555,6 +554,20 @@ def open_sqlite_library(self, library_dir: Path, is_new: bool) -> LibraryStatus: | |
| # Convert file extension list to ts_ignore file, if a .ts_ignore file does not exist | ||
| self.migrate_sql_to_ts_ignore(library_dir) | ||
|
|
||
| session.execute( | ||
| text("CREATE INDEX IF NOT EXISTS idx_tags_name_shorthand ON tags (name, shorthand)") | ||
| ) | ||
| session.execute( | ||
| text( | ||
| "CREATE INDEX IF NOT EXISTS idx_tag_parents_child_id ON tag_parents (child_id)" | ||
| ) | ||
| ) | ||
| session.execute( | ||
| text( | ||
| "CREATE INDEX IF NOT EXISTS idx_tag_entries_entry_id ON tag_entries (entry_id)" | ||
| ) | ||
| ) | ||
|
|
||
| # Update DB_VERSION | ||
| if loaded_db_version < DB_VERSION: | ||
| self.set_version(DB_VERSION_CURRENT_KEY, DB_VERSION) | ||
|
|
@@ -1054,55 +1067,61 @@ def search_library( | |
|
|
||
| return res | ||
|
|
||
| def search_tags(self, name: str | None, limit: int = 100) -> list[set[Tag]]: | ||
| def search_tags(self, name: str | None, limit: int = 100) -> tuple[list[Tag], list[Tag]]: | ||
| """Return a list of Tag records matching the query.""" | ||
| name = name or "" | ||
| name = name.lower() | ||
|
|
||
| def sort_key(text: str): | ||
| return (not text.startswith(name), len(text), text) | ||
|
|
||
| with Session(self.engine) as session: | ||
| query = select(Tag).outerjoin(TagAlias).order_by(func.lower(Tag.name)) | ||
| query = query.options( | ||
| selectinload(Tag.parent_tags), | ||
| selectinload(Tag.aliases), | ||
| ) | ||
| if limit > 0: | ||
| query = query.limit(limit) | ||
| query = select(Tag.id, Tag.name) | ||
|
|
||
| if limit > 0 and not name: | ||
| query = query.limit(limit).order_by(func.lower(Tag.name)) | ||
|
|
||
| if name: | ||
| query = query.where( | ||
| or_( | ||
| Tag.name.icontains(name), | ||
| Tag.shorthand.icontains(name), | ||
| TagAlias.name.icontains(name), | ||
| ) | ||
| ) | ||
|
|
||
| direct_tags = set(session.scalars(query)) | ||
| ancestor_tag_ids: list[Tag] = [] | ||
| for tag in direct_tags: | ||
| ancestor_tag_ids.extend( | ||
| list(session.scalars(TAG_CHILDREN_QUERY, {"tag_id": tag.id})) | ||
| ) | ||
|
|
||
| ancestor_tags = session.scalars( | ||
| select(Tag) | ||
| .where(Tag.id.in_(ancestor_tag_ids)) | ||
| .options(selectinload(Tag.parent_tags), selectinload(Tag.aliases)) | ||
| ) | ||
| tags = list(session.execute(query)) | ||
|
|
||
| res = [ | ||
| direct_tags, | ||
| {at for at in ancestor_tags if at not in direct_tags}, | ||
| ] | ||
| if name: | ||
| query = select(TagAlias.tag_id, TagAlias.name).where(TagAlias.name.icontains(name)) | ||
| tags.extend(session.execute(query)) | ||
|
|
||
| tags.sort(key=lambda t: sort_key(t[1])) | ||
| seen_ids = set() | ||
| tag_ids = [] | ||
| for row in tags: | ||
| id = row[0] | ||
| if id in seen_ids: | ||
| continue | ||
| tag_ids.append(id) | ||
| seen_ids.add(id) | ||
|
||
|
|
||
| logger.info( | ||
| "searching tags", | ||
| search=name, | ||
| limit=limit, | ||
| statement=str(query), | ||
| results=len(res), | ||
| results=len(tag_ids), | ||
| ) | ||
|
|
||
| session.expunge_all() | ||
| if limit <= 0: | ||
| limit = len(tag_ids) | ||
| tag_ids = tag_ids[:limit] | ||
|
|
||
| return res | ||
| hierarchy = self.get_tag_hierarchy(tag_ids) | ||
| direct_tags = [hierarchy.pop(id) for id in tag_ids] | ||
| ancestor_tags = list(hierarchy.values()) | ||
| ancestor_tags.sort(key=lambda t: sort_key(t.name)) | ||
| return direct_tags, ancestor_tags | ||
|
|
||
| def update_entry_path(self, entry_id: int | Entry, path: Path) -> bool: | ||
| """Set the path field of an entry. | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -218,32 +218,15 @@ def update_tags(self, query: str | None = None): | |||||||||||||
| self.scroll_layout.takeAt(self.scroll_layout.count() - 1).widget().deleteLater() | ||||||||||||||
| self.create_button_in_layout = False | ||||||||||||||
|
|
||||||||||||||
| # Get results for the search query | ||||||||||||||
| query_lower = "" if not query else query.lower() | ||||||||||||||
| # Only use the tag limit if it's an actual number (aka not "All Tags") | ||||||||||||||
| tag_limit = TagSearchPanel.tag_limit if isinstance(TagSearchPanel.tag_limit, int) else -1 | ||||||||||||||
| tag_results: list[set[Tag]] = self.lib.search_tags(name=query, limit=tag_limit) | ||||||||||||||
| if self.exclude: | ||||||||||||||
| tag_results[0] = {t for t in tag_results[0] if t.id not in self.exclude} | ||||||||||||||
| tag_results[1] = {t for t in tag_results[1] if t.id not in self.exclude} | ||||||||||||||
|
|
||||||||||||||
| # Sort and prioritize the results | ||||||||||||||
| results_0 = list(tag_results[0]) | ||||||||||||||
| results_0.sort(key=lambda tag: tag.name.lower()) | ||||||||||||||
| results_1 = list(tag_results[1]) | ||||||||||||||
| results_1.sort(key=lambda tag: tag.name.lower()) | ||||||||||||||
| raw_results = list(results_0 + results_1) | ||||||||||||||
| priority_results: set[Tag] = set() | ||||||||||||||
| all_results: list[Tag] = [] | ||||||||||||||
| direct_tags, ancestor_tags = self.lib.search_tags(name=query, limit=tag_limit) | ||||||||||||||
|
|
||||||||||||||
| if query and query.strip(): | ||||||||||||||
| for tag in raw_results: | ||||||||||||||
| if tag.name.lower().startswith(query_lower): | ||||||||||||||
| priority_results.add(tag) | ||||||||||||||
| all_results = [t for t in direct_tags if t.id not in self.exclude] | ||||||||||||||
| for tag in ancestor_tags: | ||||||||||||||
| if tag.id not in self.exclude: | ||||||||||||||
| all_results.append(tag) | ||||||||||||||
|
||||||||||||||
| all_results = [t for t in direct_tags if t.id not in self.exclude] | |
| for tag in ancestor_tags: | |
| if tag.id not in self.exclude: | |
| all_results.append(tag) | |
| all_results = [t for t in direct_tags if t.id not in self.exclude] | |
| all_results += [t for t in ancestor_tags if t.id not in self.exclude] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ended up doing this to avoid creating extra lists.
all_results.extend(t for t in ancestor_tags if t.id not in self.exclude)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This causes the sorting to happen after truncating the results, which differs from the behaviour when searching, no?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes this was not sorting by
len(text)before truncating. The previous implementation only sorted priority results by len so I've updatedsort_keyto do that. Which will make thisorder_bystatement andsort_keyproduce the same results when no query is provided.