Merged

40 commits
c84577a
WIP: Draft on parsing <dl> as normal section
benjaoming Mar 8, 2023
13c7946
Drafting what to remove
benjaoming Mar 8, 2023
780dec0
domain_data is no longer generated
benjaoming Mar 8, 2023
0968e01
Update generic logic, remove old Sphinx <dl> parsing
benjaoming Mar 13, 2023
5797c6a
Improve parsing with "General sibling combinator", add test case data
benjaoming Mar 13, 2023
b87d3ee
Add httpdomain example
benjaoming Mar 20, 2023
cc77f5e
test case for Sphinx autodoc HTML
benjaoming Mar 20, 2023
71a4018
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 23, 2023
612c687
Remove entire block that was indexing Sphinx domains
benjaoming Mar 23, 2023
562d18b
Clean up remaining Sphinx domain search index
benjaoming Mar 23, 2023
508658c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Mar 27, 2023
f6cd1d4
Strip out Sphinx line numbers in the generic `_clean_body` method
benjaoming Mar 27, 2023
147d1ce
Remove indexed <dl> contents before indexing other sections
benjaoming Mar 27, 2023
3eb302c
Update our developer docs for section indexes a bit
benjaoming Mar 27, 2023
bec3bdb
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
e690970
Apply suggestions from @stsewd code review
benjaoming Mar 28, 2023
f17d500
Removes nodes immediately, it seems that selectolax does DFS but no g…
benjaoming Mar 30, 2023
b19f587
Update docs
benjaoming Mar 30, 2023
4f6be90
Use 1-line generator
benjaoming Mar 30, 2023
d953380
Update basic example showing that it uses BFS for indexing
benjaoming Mar 30, 2023
c38f8ab
No future code path
benjaoming Mar 30, 2023
cfac4ef
Remember all <dls> that were already seen before indexing them
benjaoming Mar 30, 2023
f4f97a5
Generate output that resembles Depth-First-Search (even though we tra…
benjaoming Apr 6, 2023
380aabf
Remove <dl>s from DOM before parsing content outside of <h1..7>s
benjaoming Apr 6, 2023
d9a3021
Parse only dl>dt for each dl using a hack, remove decompose() call th…
benjaoming Apr 10, 2023
0a11690
Update test data (manually sample-read a lot and it looks good, no du…
benjaoming Apr 10, 2023
e7c4946
Also remove already indexed <dt>s from DOM before continuing to index
benjaoming Apr 10, 2023
5241741
Split <dl> parsing into separate method
benjaoming Apr 10, 2023
abbd033
Use css selectors when possible, this one works
benjaoming Apr 10, 2023
58364d0
Remove print() statements
benjaoming Apr 10, 2023
2d8d585
Cleanup: Remove inaccurate comment
benjaoming Apr 10, 2023
952142a
Cleanup: Select adjacent dd instead of iterating
benjaoming Apr 10, 2023
3de9e39
Fix strange syntax
benjaoming Apr 10, 2023
c1a0287
Do not accumulate lists: Yield indexed nodes and section content
benjaoming Apr 10, 2023
0231e6c
Merge branch 'main' of github.com:readthedocs/readthedocs.org into ge…
benjaoming Apr 10, 2023
012e8dc
Appease "darker" lint
benjaoming Apr 10, 2023
4170338
Reduce complexity: replace css selector with a Python look
benjaoming Apr 10, 2023
ffca7ad
Use "simple" analyzer on section contents
benjaoming Apr 11, 2023
8bc1450
Merge branch 'generic-html-parser-dls-remove-sphinx-domain' of github…
benjaoming Apr 11, 2023
17dfb8a
Revert "Use "simple" analyzer on section contents"
benjaoming Apr 11, 2023
130 changes: 50 additions & 80 deletions readthedocs/search/parsers.py
@@ -145,16 +145,54 @@ def _parse_sections(self, title, body):
tags = body.css(f'h{head_level}')
for tag in tags:
try:
title, id = self._parse_section_title(tag)
title, _id = self._parse_section_title(tag)
next_tag = self._get_header_container(tag).next
content, _ = self._parse_section_content(next_tag, depth=2)
yield {
'id': id,
'title': title,
'content': content,
"id": _id,
"title": title,
"content": content,
}
except Exception as e:
log.info('Unable to index section.', section=str(e))
log.info("Unable to index section.", section=str(e))

# All terms in dls are treated as sections.
dls = body.css("dl")
for dl in dls:
# Select all dts with id defined
dts = dl.css('dt[id]:not([id=""])')

# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/dt
# multiple <dt> elements in a row indicate several terms that are
# all defined by the immediate next <dd> element.
for dt in dts:
try:
title, _id = self._parse_dt(dt)
# https://developer.mozilla.org/en-US/docs/Web/CSS/General_sibling_combinator
dd = dt.css_first("dt ~ dd")
# We only index a dt with an id attribute and an accompanying dd
if not dd or not _id:
continue

# The content of the <dt> section is the content of the accompanying <dd>
content = dd.text()
yield {
"id": _id,
"title": title,
"content": content,
}
except Exception as e:
log.info("Unable to index dt section.", section=str(e))

def _parse_dt(self, tag):
"""
Parses a definition term <dt>.

If the <dt> does not have an id attribute, it cannot be referenced.
This should be understood by the caller.
"""
section_id = tag.attributes.get("id", "")
return self._parse_content(tag.text()), section_id

def _get_sections(self, title, body):
"""Get the first `self.max_inner_documents` sections."""
@@ -407,7 +445,6 @@ def _process_fjson(self, fjson_path):
sections = []
path = ''
title = ''
domain_data = {}

if 'current_page_name' in data:
path = data['current_page_name']
@@ -435,34 +472,19 @@ def _process_fjson(self, fjson_path):
try:
# Create a new html object, since the previous one could have been modified.
body = HTMLParser(data["body"])
domain_data = self._generate_domains_data(body)
except Exception:
log.info("Unable to index domains.", path=fjson_path)
else:
log.info('Unable to index content.', path=fjson_path)

return {
'path': path,
'title': title,
'sections': sections,
'domain_data': domain_data,
"path": path,
"title": title,
"sections": sections,
# this used to contain content from <dl> but this is now handled in a generic parser
"domain_data": {},
}

def _get_sphinx_domains(self, body):
"""
Get all nodes that are a sphinx domain.

A Sphinx domain is a <dl> tag which contains <dt> tags with an 'id' attribute,
dl tags that have the "footnote" class aren't domains.
"""
domains = []
dl_tags = body.css("dl:has(dt[id])")
for tag in dl_tags:
classes = tag.attributes.get("class", "").split()
if "footnote" not in classes:
domains.append(tag)
return domains

def _clean_body(self, body):
"""
Removes sphinx domain nodes.
@@ -476,10 +498,9 @@ def _clean_body(self, body):
# while we migrate the ID type of the sphinx domains table
# https://github.com/readthedocs/readthedocs.org/pull/9482.
nodes_to_be_removed = []
from readthedocs.projects.models import Feature

if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
nodes_to_be_removed = self._get_sphinx_domains(body)
# if not self.project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
# nodes_to_be_removed = self._get_sphinx_domains(body)

# TODO: see if we really need to remove these
# remove `Table of Contents` elements
@@ -491,57 +512,6 @@ def _clean_body(self, body):

return body

def _generate_domains_data(self, body):
"""
Generate sphinx domain objects' docstrings.

Returns a dict with the generated data.
The returned dict is in the following form::

{
"domain-id-1": "docstrings for the domain-id-1",
"domain-id-2": "docstrings for the domain-id-2",
}

.. note::

Only the first `self.max_inner_documents` domains are returned.
"""

domain_data = {}
dl_tags = self._get_sphinx_domains(body)
number_of_domains = 0

for dl_tag in dl_tags:

dt = dl_tag.css('dt')
dd = dl_tag.css('dd')

# len(dt) should be equal to len(dd)
# because these tags go together.
for title, desc in zip(dt, dd):
try:
id_ = title.attributes.get('id', '')
if id_:
# Create a copy of the node,
# since _parse_domain_tag will modify it.
copy_desc = HTMLParser(desc.html).body.child
docstrings = self._parse_domain_tag(copy_desc)
domain_data[id_] = docstrings
number_of_domains += 1
if number_of_domains >= self.max_inner_documents:
log.warning(
'Limit of inner domains exceeded.',
project_slug=self.project.slug,
version_slug=self.version.slug,
limit=self.max_inner_documents,
)
break
except Exception:
log.exception('Error parsing docstring for domains')

return domain_data

def _parse_domain_tag(self, tag):
"""Returns the text from the description tag of the domain."""

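For readers skimming the diff above, the core of the new generic <dl> handling is: every <dt> that carries a non-empty id becomes its own search section, and its content is taken from the first <dd> that follows it, so several consecutive <dt> terms share one <dd>. The sketch below is illustrative only and is not the code in this PR: the helper name iter_dl_sections and the plain Python sibling walk are assumptions (the diff itself pairs <dt> and <dd> with a "dt ~ dd" CSS selector via selectolax), and only selectolax calls that also appear in the diff — HTMLParser, css(), attributes, text(), next — are relied on.

from selectolax.parser import HTMLParser


def iter_dl_sections(html):
    """Yield (id, title, content) for every <dt> with a non-empty id.

    Illustrative sketch only: the real logic lives in
    readthedocs/search/parsers.py and additionally cleans markup, caps the
    number of indexed sections, and removes already-indexed nodes from the DOM.
    """
    tree = HTMLParser(html)
    for dl in tree.css("dl"):
        # Only terms that can be linked to (non-empty id) become sections,
        # mirroring the dt[id]:not([id=""]) selector used in the PR.
        for dt in dl.css("dt[id]"):
            _id = dt.attributes.get("id") or ""
            if not _id:
                continue
            # Walk forward over siblings until the defining <dd> is found;
            # consecutive <dt> terms therefore share the same <dd> content.
            node = dt.next
            while node is not None and node.tag != "dd":
                node = node.next
            if node is None:
                continue
            yield _id, dt.text(deep=True).strip(), node.text(deep=True).strip()

The basic.html/basic.json fixture pair below shows exactly this behaviour: the id-less term is folded into the page body only, while the two consecutive terms each get their own section backed by the same <dd>.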
9 changes: 9 additions & 0 deletions readthedocs/search/tests/data/generic/in/basic.html
@@ -6,5 +6,14 @@
</head>
<body>
Content of the body.
<dl>
<dt id="love">Love</dt>
<dd>To code is to love</dd>
<dt>No section for me</dt>
<dd>This term does not have an ID so it's not gonna get its own section</dd>
<dt id="code">Code</dt>
<dt id="docs">Docs</dt>
<dd>Code and docs are like love, they are 4-letter words.</dd>
</dl>
</body>
</html>
17 changes: 16 additions & 1 deletion readthedocs/search/tests/data/generic/out/basic.json
@@ -6,7 +6,22 @@
{
"id": "",
"title": "Title of the page",
"content": "Content of the body."
"content": "Content of the body. Love To code is to love No section for me This term does not have an ID so it's not gonna get its own section Code Docs Code and docs are like love, they are 4-letter words."
},
{
"id": "love",
"title": "Love",
"content": "To code is to love"
},
{
"id": "code",
"title": "Code",
"content": "Code and docs are like love, they are 4-letter words."
},
{
"id": "docs",
"title": "Docs",
"content": "Code and docs are like love, they are 4-letter words."
}
],
"domain_data": {}
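As a concrete usage example, here is a hedged pytest-style sketch tying the fixture to the expected sections. It reuses the hypothetical iter_dl_sections helper sketched after the parsers.py diff, and the test name is made up; the real suite runs the full parser over generic/in/basic.html and compares its JSON output with generic/out/basic.json.

# Assumes the iter_dl_sections sketch from above is in scope.
FIXTURE = """
<dl>
  <dt id="love">Love</dt>
  <dd>To code is to love</dd>
  <dt>No section for me</dt>
  <dd>This term does not have an ID so it's not gonna get its own section</dd>
  <dt id="code">Code</dt>
  <dt id="docs">Docs</dt>
  <dd>Code and docs are like love, they are 4-letter words.</dd>
</dl>
"""


def test_dt_terms_become_sections():
    sections = list(iter_dl_sections(FIXTURE))
    # The id-less term gets no section; the two consecutive terms share one <dd>.
    assert sections == [
        ("love", "Love", "To code is to love"),
        ("code", "Code", "Code and docs are like love, they are 4-letter words."),
        ("docs", "Docs", "Code and docs are like love, they are 4-letter words."),
    ]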