Merged

40 commits
c84577a
WIP: Draft on parsing <dl> as normal section
benjaoming Mar 8, 2023
13c7946
Drafting what to remove
benjaoming Mar 8, 2023
780dec0
domain_data is no longer generated
benjaoming Mar 8, 2023
0968e01
Update generic logic, remove old Sphinx <dl> parsing
benjaoming Mar 13, 2023
5797c6a
Improve parsing with "General sibling combinator", add test case data
benjaoming Mar 13, 2023
b87d3ee
Add httpdomain example
benjaoming Mar 20, 2023
cc77f5e
test case for Sphinx autodoc HTML
benjaoming Mar 20, 2023
71a4018
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 23, 2023
612c687
Remove entire block that was indexing Sphinx domains
benjaoming Mar 23, 2023
562d18b
Clean up remaining Sphinx domain search index
benjaoming Mar 23, 2023
508658c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 27, 2023
f6cd1d4
Strip out Sphinx line numbers in the generic `_clean_body` method
benjaoming Mar 27, 2023
147d1ce
Remove indexed <dl> contents before indexing other sections
benjaoming Mar 27, 2023
3eb302c
Update our developer docs for section indexes a bit
benjaoming Mar 27, 2023
bec3bdb
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
e690970
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
f17d500
Removes nodes immediately, it seems that selectolax does DFS but no g…
benjaoming Mar 30, 2023
b19f587
Update docs
benjaoming Mar 30, 2023
4f6be90
Use 1-line generator
benjaoming Mar 30, 2023
d953380
Update basic example showing that it uses BFS for indexing
benjaoming Mar 30, 2023
c38f8ab
No future code path
benjaoming Mar 30, 2023
cfac4ef
Remember all <dls> that were already seen before indexing them
benjaoming Mar 30, 2023
f4f97a5
Generate output that resembles Depth-First-Search (even though we tra…
benjaoming Apr 6, 2023
380aabf
Remove <dl>s from DOM before parsing content outside of <h1..7>s
benjaoming Apr 6, 2023
d9a3021
Parse only dl>dt for each dl using a hack, remove decompose() call th…
benjaoming Apr 10, 2023
0a11690
Update test data (manually sample-read a lot and it looks good, no du…
benjaoming Apr 10, 2023
e7c4946
Also remove already indexed <dt>s from DOM before continuing to index
benjaoming Apr 10, 2023
5241741
Split <dl> parsing into separate method
benjaoming Apr 10, 2023
abbd033
Use css selectors when possible, this one works
benjaoming Apr 10, 2023
58364d0
Remove print() statements
benjaoming Apr 10, 2023
2d8d585
Cleanup: Remove inaccurate comment
benjaoming Apr 10, 2023
952142a
Cleanup: Select adjacent dd instead of iterating
benjaoming Apr 10, 2023
3de9e39
Fix strange syntax
benjaoming Apr 10, 2023
c1a0287
Do not accumulate lists: Yield indexed nodes and section content
benjaoming Apr 10, 2023
0231e6c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Apr 10, 2023
012e8dc
Appease "darker" lint
benjaoming Apr 10, 2023
4170338
Reduce complexity: replace css selector with a Python look
benjaoming Apr 10, 2023
ffca7ad
Use "simple" analyzer on section contents
benjaoming Apr 11, 2023
8bc1450
Merge branch 'generic-html-parser-dls-remove-sphinx-domain' of github…
benjaoming Apr 11, 2023
17dfb8a
Revert "Use "simple" analyzer on section contents"
benjaoming Apr 11, 2023
32 changes: 27 additions & 5 deletions docs/dev/search-integration.rst
@@ -117,6 +117,13 @@ Tags to be ignored:

 - ``nav``
 
+Special rules derived from specific documentation tools are also applied in the generic parser:
+
+.. https://squidfunk.github.io/mkdocs-material/reference/code-blocks/#adding-line-numbers
+
+- ``.linenos``, ``.lineno`` (line numbers in code blocks; these come from both MkDocs and Sphinx)
+- ``.headerlink`` (added by Sphinx to links in headers)
+
 Example:
 
 .. code-block:: html
@@ -133,12 +140,27 @@ Example:
 Sections
 ~~~~~~~~
 
-Sections are composed of a title, and a content.
-A section title can be a ``h`` tag, or a ``header`` tag containing a ``h`` tag,
-the ``h`` tag or its parent can contain an ``id`` attribute, which will be used to link to the section.
+Sections are stored in a dictionary composed of ``id``, ``title`` and ``content`` keys.
 
-All content below the title, until a new section is found, will be indexed as part of the section content.
-Example:
+Sections are defined as:
+
+* ``h1``-``h6``: all content between one heading and the next heading of the same level is used as the content of that section.
+* ``dt`` elements with an ``id`` attribute: the ``title`` maps to the ``dt`` element and the content to its accompanying ``dd`` element.
+
+All sections have to be identified by a DOM container's ``id`` attribute,
+which is used to link to the section.
+How the id is detected varies with the type of element:
+
+* ``h1``-``h6`` elements use the ``id`` attribute of the heading itself if present,
+  or of its ``section`` parent (if one exists).
+* ``dt`` elements use the ``id`` attribute of the ``dt`` element.
+
+To avoid duplication and ambiguous section references,
+all indexed ``dl`` elements are removed from the DOM before other sections are indexed.
+
+Here is an example of how all content below the title,
+until a new section is found,
+is indexed as part of the section content:
 
 .. code-block:: html
    :emphasize-lines: 2-10, 12-17, 21-26
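To make the documented rules concrete, here is a minimal sketch of how they map HTML onto section dictionaries. This is not the production parser: `sketch_sections` and the sample HTML are hypothetical, and only selectolax's `HTMLParser` (which `parsers.py` already uses) is real; the real code also handles `<section>` wrappers, nested `<dl>`s and content limits.

```python
from selectolax.parser import HTMLParser

HEADINGS = {f"h{level}" for level in range(1, 7)}

# Hypothetical sample input.
HTML = """
<body>
  <h2 id="install">Install</h2>
  <p>pip install example</p>
  <dl>
    <dt id="love">Love</dt>
    <dd>To code is to love</dd>
  </dl>
</body>
"""


def sketch_sections(html):
    """Hypothetical illustration of the documented section rules."""
    tree = HTMLParser(html)
    sections = []

    # Rule: a <dt> with an id becomes a section; the adjacent <dd> is its
    # content. The "dt ~ dd" selection mirrors what _parse_dls does.
    for dt in tree.css("dt[id]"):
        dd = dt.css_first("dt ~ dd")
        if dd is None:
            continue
        sections.append({
            "id": dt.attributes.get("id", ""),
            "title": dt.text(strip=True),
            "content": dd.text(strip=True),
        })
        # Indexed nodes are removed from the DOM so the heading pass below
        # cannot index them twice (the real parser defers these removals
        # until after the traversal).
        dt.decompose()
        dd.decompose()

    # Rule: h1-h6 headings become sections; content runs until the
    # next heading.
    for level in range(1, 7):
        for tag in tree.css(f"h{level}"):
            content, node = [], tag.next
            while node is not None and node.tag not in HEADINGS:
                content.append(node.text(strip=True))
                node = node.next
            sections.append({
                "id": tag.attributes.get("id", ""),
                "title": tag.text(strip=True),
                "content": " ".join(part for part in content if part),
            })
    return sections


for section in sketch_sections(HTML):
    print(section)
```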
244 changes: 119 additions & 125 deletions readthedocs/search/parsers.py
@@ -123,6 +123,21 @@ def _parse_sections(self, title, body):
         We can have pages that have content before the first title or that don't have a title,
         we index that content first under the title of the original page.
         """
+
+        document_title = title
+
+        indexed_nodes = []
+
+        for dd, dt, section in self._parse_dls(body):
+            indexed_nodes.append(dd)
+            indexed_nodes.append(dt)
+            yield section
+
+        # Remove all seen and indexed data outside of the traversal.
+        # We want to avoid modifying the DOM tree while traversing it.
+        for node in indexed_nodes:
+            node.decompose()
+
         # Index content for pages that don't start with a title.
         # We check for sections till 3 levels to avoid indexing all the content
         # in this step.
@@ -133,28 +148,94 @@
             )
             if content:
                 yield {
-                    'id': '',
-                    'title': title,
-                    'content': content,
+                    "id": "",
+                    "title": document_title,
+                    "content": content,
                 }
         except Exception as e:
-            log.info('Unable to index section', section=str(e))
+            log.info("Unable to index section", section=str(e))
 
         # Index content from h1 to h6 headers.
-        for head_level in range(1, 7):
-            tags = body.css(f'h{head_level}')
-            for tag in tags:
+        for section in [body.css(f"h{h}") for h in range(1, 7)]:
+            for tag in section:
                 try:
-                    title, id = self._parse_section_title(tag)
+                    title, _id = self._parse_section_title(tag)
                     next_tag = self._get_header_container(tag).next
                     content, _ = self._parse_section_content(next_tag, depth=2)
                     yield {
-                        'id': id,
-                        'title': title,
-                        'content': content,
+                        "id": _id,
+                        "title": title,
+                        "content": content,
                     }
-                except Exception as e:
-                    log.info('Unable to index section.', section=str(e))
+                except Exception:
+                    log.info("Unable to index section.", exc_info=True)
+
+    def _parse_dls(self, body):
+
+        # All terms in <dl>s are treated as sections.
+        # We traverse by <dl> - traversing by <dt> has been shown in experiments
+        # to produce a different traversal order, which could make the tests more unstable.
+        dls = body.css("dl")
+
+        for dl in dls:
+
+            # Hack: Since we cannot use '> dt' nor ':host' in selectolax/Modest,
+            # we use an iterator to select immediate descendants.
+            dts = (node for node in dl.iter() if node.tag == "dt" and node.id)
+
+            # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/dt
+            # Multiple <dt> elements in a row indicate several terms that are
+            # all defined by the immediately following <dd> element.
+            for dt in dts:
+                title, _id = self._parse_dt(dt)
+                # Select the first adjacent <dd> using a "gamble" that seems to work.
+                # We cannot select by the current <dt>'s ID here, because IDs can contain
+                # characters that are invalid CSS selector syntax, and there is no
+                # apparent way to escape them.
+                # https://developer.mozilla.org/en-US/docs/Web/CSS/General_sibling_combinator
+                dd = dt.css_first("dt ~ dd")
+
+                # We only index a <dt> with an id attribute and an accompanying <dd>.
+                if not dd or not _id:
+                    continue
+
+                # Create a copy of the node to avoid manipulating the
+                # data structure that we're iterating over.
+                dd_copy = HTMLParser(dd.html).body.child
+
+                # Remove all nested definition lists from dd_copy.
+                # They are already parsed separately.
+                for node in dd_copy.css("dl"):
+                    # Traverse all <dt>s with an ID (the ones we index!)
+                    for _dt in node.css('dt[id]:not([id=""])'):
+                        # Fetch the adjacent <dd> and remove it.
+                        _dd_dt = _dt.css_first("dt ~ dd")
+                        if _dd_dt:
+                            _dd_dt.decompose()
+                        # Remove the <dt> too.
+                        _dt.decompose()
+
+                # The content of the <dt> section is the content of the accompanying <dd>.
+                content = self._parse_content(dd_copy.text())
+
+                yield (
+                    dd,
+                    dt,
+                    {
+                        "id": _id,
+                        "title": title,
+                        "content": content,
+                    },
+                )
+
+    def _parse_dt(self, tag):
+        """
+        Parse a definition term <dt>.
+
+        If the <dt> does not have an id attribute, it cannot be referenced;
+        the caller is expected to handle this.
+        """
+        section_id = tag.attributes.get("id", "")
+        return self._parse_content(tag.text()), section_id
 
     def _get_sections(self, title, body):
         """Get the first `self.max_inner_documents` sections."""
Expand All @@ -177,6 +258,9 @@ def _clean_body(self, body):
"""
Removes nodes with irrelevant content before parsing its sections.

This method is documented here:
https://dev.readthedocs.io/page/search-integration.html#irrelevant-content

.. warning::

This will mutate the original `body`.
@@ -188,6 +272,10 @@
             body.css('[role=search]'),
             # Permalinks
             body.css('.headerlink'),
+            # Line numbers from code blocks; they are very noisy in content.
+            # This convention is popular in Sphinx.
+            body.css(".linenos"),
+            body.css(".lineno"),
         )
         for node in nodes_to_be_removed:
             node.decompose()
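The pattern this hunk extends — collect matches with CSS selectors, chain them, and `decompose()` each node — is easy to demonstrate in isolation. A minimal sketch with hypothetical sample HTML, using selectolax and `itertools` as `parsers.py` already does:

```python
import itertools

from selectolax.parser import HTMLParser

# Hypothetical code-block markup with Sphinx-style line numbers.
html = """
<div class="highlight">
  <span class="linenos">1</span><code>print("hello")</code>
  <span class="linenos">2</span><code>print("world")</code>
</div>
"""

tree = HTMLParser(html)

# Same shape as _clean_body: chain all matches, then remove each node
# from the DOM so the noisy line numbers are never indexed.
for node in itertools.chain(tree.css(".linenos"), tree.css(".lineno")):
    node.decompose()

print(tree.body.text(strip=True))  # the line numbers "1" and "2" are gone
```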
@@ -407,7 +495,6 @@ def _process_fjson(self, fjson_path):
         sections = []
         path = ''
         title = ''
-        domain_data = {}
 
         if 'current_page_name' in data:
             path = data['current_page_name']
@@ -424,137 +511,44 @@
             try:
                 body = self._clean_body(HTMLParser(data["body"]))
                 sections = self._get_sections(title=title, body=body.body)
-            except Exception:
-                log.info('Unable to index sections.', path=fjson_path)
-
-        # XXX: Don't index domains while we migrate the ID type of the sphinx domains table.
-        # https://github.com/readthedocs/readthedocs.org/pull/9482.
-        from readthedocs.projects.models import Feature
-
-        if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
-            try:
-                # Create a new html object, since the previous one could have been modified.
-                body = HTMLParser(data["body"])
-                domain_data = self._generate_domains_data(body)
-            except Exception:
-                log.info("Unable to index domains.", path=fjson_path)
+            except Exception as e:
+                log.info("Unable to index sections.", path=fjson_path, exception=e)
         else:
             log.info('Unable to index content.', path=fjson_path)
 
         return {
-            'path': path,
-            'title': title,
-            'sections': sections,
-            'domain_data': domain_data,
+            "path": path,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},  # deprecated
         }
 
-    def _get_sphinx_domains(self, body):
-        """
-        Get all nodes that are a sphinx domain.
-
-        A Sphinx domain is a <dl> tag which contains <dt> tags with an 'id' attribute,
-        dl tags that have the "footnote" class aren't domains.
-        """
-        domains = []
-        dl_tags = body.css("dl:has(dt[id])")
-        for tag in dl_tags:
-            classes = tag.attributes.get("class", "").split()
-            if "footnote" not in classes:
-                domains.append(tag)
-        return domains
-
     def _clean_body(self, body):
         """
-        Removes sphinx domain nodes.
+        Removes nodes in Sphinx-generated HTML structures.
 
         This method is overridden to remove contents that are likely
-        to be a sphinx domain (`dl` tags).
-        We already index those in another step.
+        to be useless for search indexing.
+
+        Currently: TOC elements.
         """
         body = super()._clean_body(body)
-        # XXX: Don't exclude domains from the general search
-        # while we migrate the ID type of the sphinx domains table
-        # https://github.com/readthedocs/readthedocs.org/pull/9482.
-        nodes_to_be_removed = []
-        from readthedocs.projects.models import Feature
-
-        if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
-            nodes_to_be_removed = self._get_sphinx_domains(body)
-
-        # TODO: see if we really need to remove these
-        # remove `Table of Contents` elements
-        nodes_to_be_removed += body.css('.toctree-wrapper') + body.css('.contents.local.topic')
+        # TODO: see if we really need to remove TOC elements like the ones below?
+        # benjaoming: I didn't see this in sphinx-rtd-theme; however, since it wraps the menu
+        # in a <nav>, it's already covered. I didn't see this match local tables of contents
+        # either, and they are also wrapped in a <nav>, so they are covered by _clean_body as well.
+        nodes_to_be_removed = itertools.chain(
+            body.css(".toctree-wrapper"),
+            body.css(".contents.local.topic"),
+        )
 
         # removing all nodes in list
         for node in nodes_to_be_removed:
             node.decompose()
 
         return body
 
-    def _generate_domains_data(self, body):
-        """
-        Generate sphinx domain objects' docstrings.
-
-        Returns a dict with the generated data.
-        The returned dict is in the following form::
-
-            {
-                "domain-id-1": "docstrings for the domain-id-1",
-                "domain-id-2": "docstrings for the domain-id-2",
-            }
-
-        .. note::
-
-           Only the first `self.max_inner_documents` domains are returned.
-        """
-
-        domain_data = {}
-        dl_tags = self._get_sphinx_domains(body)
-        number_of_domains = 0
-
-        for dl_tag in dl_tags:
-
-            dt = dl_tag.css('dt')
-            dd = dl_tag.css('dd')
-
-            # len(dt) should be equal to len(dd)
-            # because these tags go together.
-            for title, desc in zip(dt, dd):
-                try:
-                    id_ = title.attributes.get('id', '')
-                    if id_:
-                        # Create a copy of the node,
-                        # since _parse_domain_tag will modify it.
-                        copy_desc = HTMLParser(desc.html).body.child
-                        docstrings = self._parse_domain_tag(copy_desc)
-                        domain_data[id_] = docstrings
-                        number_of_domains += 1
-                        if number_of_domains >= self.max_inner_documents:
-                            log.warning(
-                                'Limit of inner domains exceeded.',
-                                project_slug=self.project.slug,
-                                version_slug=self.version.slug,
-                                limit=self.max_inner_documents,
-                            )
-                            break
-                except Exception:
-                    log.exception('Error parsing docstring for domains')
-
-        return domain_data
-
-    def _parse_domain_tag(self, tag):
-        """Returns the text from the description tag of the domain."""
-
-        # Remove all nested domains,
-        # since they are already parsed separately.
-        nested_domains = self._get_sphinx_domains(tag)
-        for node in nested_domains:
-            if tag != node:
-                node.decompose()
-
-        docstring = self._parse_content(tag.text())
-        return docstring
 
 
 class MkDocsParser(GenericParser):
 
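One detail of `_parse_dls` deserves a standalone illustration: because selectolax's Modest engine supports neither `> dt` nor `:host`, the PR filters `dl.iter()` to get immediate children. A sketch of why plain descendant selection would be wrong for nested definition lists (the sample HTML is hypothetical):

```python
from selectolax.parser import HTMLParser

# Hypothetical nested definition list.
html = """
<dl>
  <dt id="outer">Outer term</dt>
  <dd>
    <dl>
      <dt id="inner">Inner term</dt>
      <dd>Nested definition</dd>
    </dl>
  </dd>
</dl>
"""

outer_dl = HTMLParser(html).css_first("dl")

# A descendant selector matches <dt>s of nested <dl>s too, which would
# index "inner" twice once the nested <dl> is traversed on its own:
print([dt.id for dt in outer_dl.css("dt")])  # ['outer', 'inner']

# Iterating immediate children, as _parse_dls does, only yields the
# <dt>s that belong directly to this <dl>:
print([node.id for node in outer_dl.iter() if node.tag == "dt"])  # ['outer']
```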
24 changes: 24 additions & 0 deletions readthedocs/search/tests/data/generic/in/basic.html
@@ -6,5 +6,29 @@
 </head>
 <body>
 Content of the body.
+<dl>
+  <dt id="love">Love</dt>
+  <dd>To code is to love</dd>
+  <dt>No section for me</dt>
+  <dd>This term does not have an ID so it's not gonna get its own section</dd>
+  <dt id="code">Code</dt>
+  <dt id="docs">Docs</dt>
+  <dd>Code and docs are like love, they are 4-letter words.</dd>
+  <dt id="complexity">Complexity</dt>
+  <dd>
+    <dl>
+      <dt id="mother">Mother</dt>
+      <dd>Mother cannot fly</dd>
+      <dt id="stones">All stones</dt>
+      <dd>All stones cannot fly</dd>
+      <dt id="ergo">Ergo</dt>
+      <dd>Mother is a stone</dd>
+    </dl>
+    <dl>
+      <dt>Sub-term of complexity</dt>
+      <dd>I am indexed in the #complexity section</dd>
+    </dl>
+  </dd>
+</dl>
 </body>
 </html>
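Given the parsing rules in this PR, the fixture above should produce roughly the following sections. This is a hand-written approximation for illustration (the committed expected-output test data is authoritative): terms without an id get no section of their own, consecutive `<dt>`s share the following `<dd>` as content, and the already-indexed nested `<dl>` is stripped from #complexity's content while the id-less nested `<dl>` remains part of it.

```python
# Hand-written approximation of the sections the generic parser should
# emit for basic.html; exact ordering and whitespace may differ.
expected_sections = [
    {"id": "love", "title": "Love", "content": "To code is to love"},
    # <dt>No section for me</dt> has no id, so it is skipped.
    # Consecutive <dt>s share the immediately following <dd>:
    {
        "id": "code",
        "title": "Code",
        "content": "Code and docs are like love, they are 4-letter words.",
    },
    {
        "id": "docs",
        "title": "Docs",
        "content": "Code and docs are like love, they are 4-letter words.",
    },
    # The nested indexed <dt>/<dd> pairs are removed from a copy of the
    # <dd>, but the id-less "Sub-term of complexity" list stays in:
    {
        "id": "complexity",
        "title": "Complexity",
        "content": "Sub-term of complexity I am indexed in the #complexity section",
    },
    {"id": "mother", "title": "Mother", "content": "Mother cannot fly"},
    {"id": "stones", "title": "All stones", "content": "All stones cannot fly"},
    {"id": "ergo", "title": "Ergo", "content": "Mother is a stone"},
    # Content before any section is indexed under the page title
    # (the <title> element is not shown in this diff).
    {"id": "", "title": "(page title)", "content": "Content of the body."},
]
```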