Skip to content
Merged
Changes from 2 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
c84577a
WIP: Draft on parsing <dl> as normal section
benjaoming Mar 8, 2023
13c7946
Drafting what to remove
benjaoming Mar 8, 2023
780dec0
domain_data is no longer generated
benjaoming Mar 8, 2023
0968e01
Update generic logic, remove old Sphinx <dl> parsing
benjaoming Mar 13, 2023
5797c6a
Improve parsing with "General sibling combinator", add test case data
benjaoming Mar 13, 2023
b87d3ee
Add httpdomain example
benjaoming Mar 20, 2023
cc77f5e
test case for Sphinx autodoc HTML
benjaoming Mar 20, 2023
71a4018
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 23, 2023
612c687
Remove entire block that was indexing Sphinx domains
benjaoming Mar 23, 2023
562d18b
Clean up remaining Sphinx domain search index
benjaoming Mar 23, 2023
508658c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 27, 2023
f6cd1d4
Strip out Sphinx line numbers in the generic `_clean_body` method
benjaoming Mar 27, 2023
147d1ce
Remove indexed <dl> contents before indexing other sections
benjaoming Mar 27, 2023
3eb302c
Update our developer docs for section indexes a bit
benjaoming Mar 27, 2023
bec3bdb
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
e690970
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
f17d500
Removes nodes immediately, it seems that selectolax does DFS but no g…
benjaoming Mar 30, 2023
b19f587
Update docs
benjaoming Mar 30, 2023
4f6be90
Use 1-line generator
benjaoming Mar 30, 2023
d953380
Update basic example showing that it uses BFS for indexing
benjaoming Mar 30, 2023
c38f8ab
No future code path
benjaoming Mar 30, 2023
cfac4ef
Remember all <dls> that were already seen before indexing them
benjaoming Mar 30, 2023
f4f97a5
Generate output that resembles Depth-First-Search (even though we tra…
benjaoming Apr 6, 2023
380aabf
Remove <dl>s from DOM before parsing content outside of <h1..7>s
benjaoming Apr 6, 2023
d9a3021
Parse only dl>dt for each dl using a hack, remove decompose() call th…
benjaoming Apr 10, 2023
0a11690
Update test data (manually sample-read a lot and it looks good, no du…
benjaoming Apr 10, 2023
e7c4946
Also remove already indexed <dt>s from DOM before continuing to index
benjaoming Apr 10, 2023
5241741
Split <dl> parsing into separate method
benjaoming Apr 10, 2023
abbd033
Use css selectors when possible, this one works
benjaoming Apr 10, 2023
58364d0
Remove print() statements
benjaoming Apr 10, 2023
2d8d585
Cleanup: Remove inaccurate comment
benjaoming Apr 10, 2023
952142a
Cleanup: Select adjacent dd instead of iterating
benjaoming Apr 10, 2023
3de9e39
Fix strange syntax
benjaoming Apr 10, 2023
c1a0287
Do not accumulate lists: Yield indexed nodes and section content
benjaoming Apr 10, 2023
0231e6c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Apr 10, 2023
012e8dc
Appease "darker" lint
benjaoming Apr 10, 2023
4170338
Reduce complexity: replace css selector with a Python loop
benjaoming Apr 10, 2023
ffca7ad
Use "simple" analyzer on section contents
benjaoming Apr 11, 2023
8bc1450
Merge branch 'generic-html-parser-dls-remove-sphinx-domain' of github…
benjaoming Apr 11, 2023
17dfb8a
Revert "Use "simple" analyzer on section contents"
benjaoming Apr 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions readthedocs/search/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,40 @@ def _parse_sections(self, title, body):
'title': title,
'content': content,
}
except Exception as e:
log.info("Unable to index section.", section=str(e))

dls = body.css("dl")
for dl in dls:
dts = dl.css("dt")

for dt in dts:
try:
title, id = self._parse_dt(tag)
next_element = dt.next
# We only index a dt with an accompanying dd
if next_element.tag != "dd":
continue
content, _ = self._parse_section_content(next_element, depth=2)
yield {
"id": id,
"title": title,
"content": content,
}
except Exception as e:
log.info('Unable to index section.', section=str(e))

def _parse_dt(self, tag):
    """
    Parse a definition term (``<dt>``) into a title and an anchor id.

    :param tag: a ``<dt>`` node.
    :returns: tuple ``(title, section_id)``. The id is read from the
        ``<dt>`` itself, falling back to its parent node; it is always a
        string (empty when no id is present anywhere).
    """
    # ``attributes.get`` can yield None for a valueless attribute
    # (e.g. a bare ``id`` with no value), so coalesce to "" to keep the
    # "always a string" contract for callers.
    section_id = tag.attributes.get("id", "") or ""
    if not section_id:
        # Sphinx sometimes puts the anchor on the enclosing element
        # instead of the <dt>. Guard against a detached node whose
        # parent is None.
        parent = tag.parent
        if parent is not None:
            section_id = parent.attributes.get("id", "") or ""

    return self._parse_content(tag.text()), section_id

def _get_sections(self, title, body):
"""Get the first `self.max_inner_documents` sections."""
iterator = self._parse_sections(title=title, body=body)
Expand Down Expand Up @@ -407,7 +438,6 @@ def _process_fjson(self, fjson_path):
sections = []
path = ''
title = ''
domain_data = {}

if 'current_page_name' in data:
path = data['current_page_name']
Expand Down Expand Up @@ -435,21 +465,23 @@ def _process_fjson(self, fjson_path):
try:
# Create a new html object, since the previous one could have been modified.
body = HTMLParser(data["body"])
domain_data = self._generate_domains_data(body)
# domain_data = self._generate_domains_data(body)
except Exception:
log.info("Unable to index domains.", path=fjson_path)
else:
log.info('Unable to index content.', path=fjson_path)

return {
'path': path,
'title': title,
'sections': sections,
'domain_data': domain_data,
"path": path,
"title": title,
"sections": sections,
"domain_data": {}, # domain_data,
}

def _get_sphinx_domains(self, body):
"""
REMOVING THIS

Get all nodes that are a sphinx domain.

A Sphinx domain is a <dl> tag which contains <dt> tags with an 'id' attribute,
Expand All @@ -476,10 +508,9 @@ def _clean_body(self, body):
# while we migrate the ID type of the sphinx domains table
# https://github.com/readthedocs/readthedocs.org/pull/9482.
nodes_to_be_removed = []
from readthedocs.projects.models import Feature

if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
nodes_to_be_removed = self._get_sphinx_domains(body)
# if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
# nodes_to_be_removed = self._get_sphinx_domains(body)

# TODO: see if we really need to remove these
# remove `Table of Contents` elements
Expand All @@ -493,6 +524,8 @@ def _clean_body(self, body):

def _generate_domains_data(self, body):
"""
REMOVING THIS

Generate sphinx domain objects' docstrings.

Returns a dict with the generated data.
Expand Down