Merged

40 commits
c84577a
WIP: Draft on parsing <dl> as normal section
benjaoming Mar 8, 2023
13c7946
Drafting what to remove
benjaoming Mar 8, 2023
780dec0
domain_data is no longer generated
benjaoming Mar 8, 2023
0968e01
Update generic logic, remove old Sphinx <dl> parsing
benjaoming Mar 13, 2023
5797c6a
Improve parsing with "General sibling combinator", add test case data
benjaoming Mar 13, 2023
b87d3ee
Add httpdomain example
benjaoming Mar 20, 2023
cc77f5e
test case for Sphinx autodoc HTML
benjaoming Mar 20, 2023
71a4018
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 23, 2023
612c687
Remove entire block that was indexing Sphinx domains
benjaoming Mar 23, 2023
562d18b
Clean up remaining Sphinx domain search index
benjaoming Mar 23, 2023
508658c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 27, 2023
f6cd1d4
Strip out Sphinx line numbers in the generic `_clean_body` method
benjaoming Mar 27, 2023
147d1ce
Remove indexed <dl> contents before indexing other sections
benjaoming Mar 27, 2023
3eb302c
Update our developer docs for section indexes a bit
benjaoming Mar 27, 2023
bec3bdb
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
e690970
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
f17d500
Removes nodes immediately, it seems that selectolax does DFS but no g…
benjaoming Mar 30, 2023
b19f587
Update docs
benjaoming Mar 30, 2023
4f6be90
Use 1-line generator
benjaoming Mar 30, 2023
d953380
Update basic example showing that it uses BFS for indexing
benjaoming Mar 30, 2023
c38f8ab
No future code path
benjaoming Mar 30, 2023
cfac4ef
Remember all <dls> that were already seen before indexing them
benjaoming Mar 30, 2023
f4f97a5
Generate output that resembles Depth-First-Search (even though we tra…
benjaoming Apr 6, 2023
380aabf
Remove <dl>s from DOM before parsing content outside of <h1..7>s
benjaoming Apr 6, 2023
d9a3021
Parse only dl>dt for each dl using a hack, remove decompose() call th…
benjaoming Apr 10, 2023
0a11690
Update test data (manually sample-read a lot and it looks good, no du…
benjaoming Apr 10, 2023
e7c4946
Also remove already indexed <dt>s from DOM before continuing to index
benjaoming Apr 10, 2023
5241741
Split <dl> parsing into separate method
benjaoming Apr 10, 2023
abbd033
Use css selectors when possible, this one works
benjaoming Apr 10, 2023
58364d0
Remove print() statements
benjaoming Apr 10, 2023
2d8d585
Cleanup: Remove inaccurate comment
benjaoming Apr 10, 2023
952142a
Cleanup: Select adjacent dd instead of iterating
benjaoming Apr 10, 2023
3de9e39
Fix strange syntax
benjaoming Apr 10, 2023
c1a0287
Do not accumulate lists: Yield indexed nodes and section content
benjaoming Apr 10, 2023
0231e6c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Apr 10, 2023
012e8dc
Appease "darker" lint
benjaoming Apr 10, 2023
4170338
Reduce complexity: replace css selector with a Python look
benjaoming Apr 10, 2023
ffca7ad
Use "simple" analyzer on section contents
benjaoming Apr 11, 2023
8bc1450
Merge branch 'generic-html-parser-dls-remove-sphinx-domain' of github…
benjaoming Apr 11, 2023
17dfb8a
Revert "Use "simple" analyzer on section contents"
benjaoming Apr 11, 2023
130 changes: 50 additions & 80 deletions readthedocs/search/parsers.py
@@ -145,16 +145,54 @@ def _parse_sections(self, title, body):
tags = body.css(f'h{head_level}')
for tag in tags:
try:
title, id = self._parse_section_title(tag)
title, _id = self._parse_section_title(tag)
next_tag = self._get_header_container(tag).next
content, _ = self._parse_section_content(next_tag, depth=2)
yield {
'id': id,
'title': title,
'content': content,
"id": _id,
"title": title,
"content": content,
}
except Exception as e:
log.info('Unable to index section.', section=str(e))
log.info("Unable to index section.", section=str(e))

# All terms in dls are treated as sections.
dls = body.css("dl")
for dl in dls:
# Select all dts with id defined
dts = dl.css('dt[id]:not([id=""])')

# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/dt
# multiple <dt> elements in a row indicate several terms that are
# all defined by the immediate next <dd> element.
for dt in dts:
try:
title, _id = self._parse_dt(dt)
# https://developer.mozilla.org/en-US/docs/Web/CSS/General_sibling_combinator
dd = dt.css_first("dt ~ dd")
# We only index a dt with an id attribute and an accompanying dd
if not dd or not _id:
continue

# The content of the <dt> section is the content of the accompanying <dd>
content = dd.text()
yield {
"id": _id,
"title": title,
"content": content,
}
except Exception as e:
log.info("Unable to index dt section.", section=str(e))

def _parse_dt(self, tag):
"""
Parses a definition term <dt>.

If the <dt> does not have an id attribute, it cannot be referenced.
This should be understood by the caller.
"""
section_id = tag.attributes.get("id", "")
return self._parse_content(tag.text()), section_id

def _get_sections(self, title, body):
"""Get the first `self.max_inner_documents` sections."""
@@ -407,7 +445,6 @@ def _process_fjson(self, fjson_path):
sections = []
path = ''
title = ''
domain_data = {}

if 'current_page_name' in data:
path = data['current_page_name']
@@ -435,34 +472,19 @@ def _process_fjson(self, fjson_path):
try:
# Create a new html object, since the previous one could have been modified.
body = HTMLParser(data["body"])
domain_data = self._generate_domains_data(body)
except Exception:
log.info("Unable to index domains.", path=fjson_path)
else:
log.info('Unable to index content.', path=fjson_path)

return {
'path': path,
'title': title,
'sections': sections,
'domain_data': domain_data,
"path": path,
"title": title,
"sections": sections,
# this used to contain content from <dl> but this is now handled in a generic parser
"domain_data": {},
}

def _get_sphinx_domains(self, body):
"""
Get all nodes that are a sphinx domain.

A Sphinx domain is a <dl> tag which contains <dt> tags with an 'id' attribute,
dl tags that have the "footnote" class aren't domains.
"""
domains = []
dl_tags = body.css("dl:has(dt[id])")
for tag in dl_tags:
classes = tag.attributes.get("class", "").split()
if "footnote" not in classes:
domains.append(tag)
return domains

def _clean_body(self, body):
"""
Removes sphinx domain nodes.
@@ -476,10 +498,9 @@ def _clean_body(self, body):
# while we migrate the ID type of the sphinx domains table
# https://github.com/readthedocs/readthedocs.org/pull/9482.
nodes_to_be_removed = []
from readthedocs.projects.models import Feature

if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
nodes_to_be_removed = self._get_sphinx_domains(body)
# if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
# nodes_to_be_removed = self._get_sphinx_domains(body)

# TODO: see if we really need to remove these
# remove `Table of Contents` elements
@@ -491,57 +512,6 @@ def _clean_body(self, body):

return body

def _generate_domains_data(self, body):
"""
Generate sphinx domain objects' docstrings.

Returns a dict with the generated data.
The returned dict is in the following form::

{
"domain-id-1": "docstrings for the domain-id-1",
"domain-id-2": "docstrings for the domain-id-2",
}

.. note::

Only the first `self.max_inner_documents` domains are returned.
"""

domain_data = {}
dl_tags = self._get_sphinx_domains(body)
number_of_domains = 0

for dl_tag in dl_tags:

dt = dl_tag.css('dt')
dd = dl_tag.css('dd')

# len(dt) should be equal to len(dd)
# because these tags go together.
for title, desc in zip(dt, dd):
try:
id_ = title.attributes.get('id', '')
if id_:
# Create a copy of the node,
# since _parse_domain_tag will modify it.
copy_desc = HTMLParser(desc.html).body.child
docstrings = self._parse_domain_tag(copy_desc)
domain_data[id_] = docstrings
number_of_domains += 1
if number_of_domains >= self.max_inner_documents:
log.warning(
'Limit of inner domains exceeded.',
project_slug=self.project.slug,
version_slug=self.version.slug,
limit=self.max_inner_documents,
)
break
except Exception:
log.exception('Error parsing docstring for domains')

return domain_data

def _parse_domain_tag(self, tag):
"""Returns the text from the description tag of the domain."""

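For readers skimming the diff above, the core of the new generic <dl> handling is: every <dt> that carries a non-empty id becomes its own search section, and its content is taken from the first <dd> that follows it, so several consecutive <dt> terms share one <dd>. The sketch below is illustrative only and is not the code in this PR: the helper name iter_dl_sections and the plain Python sibling walk are assumptions (the diff itself pairs <dt> and <dd> with a "dt ~ dd" CSS selector via selectolax), and only selectolax calls that also appear in the diff — HTMLParser, css(), attributes, text(), next — are relied on.

from selectolax.parser import HTMLParser


def iter_dl_sections(html):
    """Yield (id, title, content) for every <dt> with a non-empty id.

    Illustrative sketch only: the real logic lives in
    readthedocs/search/parsers.py and additionally cleans markup, caps the
    number of indexed sections, and removes already-indexed nodes from the DOM.
    """
    tree = HTMLParser(html)
    for dl in tree.css("dl"):
        # Only terms that can be linked to (non-empty id) become sections,
        # mirroring the dt[id]:not([id=""]) selector used in the PR.
        for dt in dl.css("dt[id]"):
            _id = dt.attributes.get("id") or ""
            if not _id:
                continue
            # Walk forward over siblings until the defining <dd> is found;
            # consecutive <dt> terms therefore share the same <dd> content.
            node = dt.next
            while node is not None and node.tag != "dd":
                node = node.next
            if node is None:
                continue
            yield _id, dt.text(deep=True).strip(), node.text(deep=True).strip()

The basic.html/basic.json fixture pair below shows exactly this behaviour: the id-less term is folded into the page body only, while the two consecutive terms each get their own section backed by the same <dd>.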
9 changes: 9 additions & 0 deletions readthedocs/search/tests/data/generic/in/basic.html
@@ -6,5 +6,14 @@
</head>
<body>
Content of the body.
<dl>
<dt id="love">Love</dt>
<dd>To code is to love</dd>
<dt>No section for me</dt>
<dd>This term does not have an ID so it's not gonna get its own section</dd>
<dt id="code">Code</dt>
<dt id="docs">Docs</dt>
<dd>Code and docs are like love, they are 4-letter words.</dd>
</dl>
</body>
</html>
17 changes: 16 additions & 1 deletion readthedocs/search/tests/data/generic/out/basic.json
@@ -6,7 +6,22 @@
{
"id": "",
"title": "Title of the page",
"content": "Content of the body."
"content": "Content of the body. Love To code is to love No section for me This term does not have an ID so it's not gonna get its own section Code Docs Code and docs are like love, they are 4-letter words."
},
{
"id": "love",
"title": "Love",
"content": "To code is to love"
},
{
"id": "code",
"title": "Code",
"content": "Code and docs are like love, they are 4-letter words."
},
{
"id": "docs",
"title": "Docs",
"content": "Code and docs are like love, they are 4-letter words."
}
],
"domain_data": {}
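As a concrete usage example, here is a hedged pytest-style sketch tying the fixture to the expected sections. It reuses the hypothetical iter_dl_sections helper sketched after the parsers.py diff, and the test name is made up; the real suite runs the full parser over generic/in/basic.html and compares its JSON output with generic/out/basic.json.

# Assumes the iter_dl_sections sketch from above is in scope.
FIXTURE = """
<dl>
  <dt id="love">Love</dt>
  <dd>To code is to love</dd>
  <dt>No section for me</dt>
  <dd>This term does not have an ID so it's not gonna get its own section</dd>
  <dt id="code">Code</dt>
  <dt id="docs">Docs</dt>
  <dd>Code and docs are like love, they are 4-letter words.</dd>
</dl>
"""


def test_dt_terms_become_sections():
    sections = list(iter_dl_sections(FIXTURE))
    # The id-less term gets no section; the two consecutive terms share one <dd>.
    assert sections == [
        ("love", "Love", "To code is to love"),
        ("code", "Code", "Code and docs are like love, they are 4-letter words."),
        ("docs", "Docs", "Code and docs are like love, they are 4-letter words."),
    ]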