JSHtml

jtojnar · jtojnar · commit 8d17a88d7796 · 2024-03-16T22:17:47.000+01:00
diff --git a/src/Readability.php b/src/Readability.php
@@ -120,7 +120,7 @@ class Readability implements LoggerAwareInterface
      */
     protected ?string $domainRegExp = null;
 
-    protected ?\DOMElement $body = null;
+    protected ?JSLikeHTMLElement $body = null;
 
     /**
      * @var ?string Cache the body HTML in case we need to re-use it later
@@ -262,6 +262,7 @@ public function init(): bool
 
         // Assume successful outcome
         $this->success = true;
+        /** @var \DOMNodeList<JSLikeHTMLElement> */
         $bodyElems = $this->dom->getElementsByTagName('body');
 
         // WTF multiple body nodes?
@@ -284,7 +285,7 @@ public function init(): bool
         $articleTitle = $this->getArticleTitle();
         $articleContent = $this->grabArticle();
 
-        if (!$articleContent) {
+        if (null === $articleContent) {
             $this->success = false;
             $articleContent = $this->dom->createElement('div');
             $articleContent->setAttribute('class', 'readability-content');
@@ -423,7 +424,7 @@ public function prepArticle(\DOMNode $articleContent): void
         }
 
         // Remove service data-candidate attribute.
-        /** @var \DOMNodeList<\DOMElement> */
+        /** @var \DOMNodeList<JSLikeHTMLElement> */
         $elems = $xpath->query('.//*[@data-candidate]', $articleContent);
         for ($i = $elems->length - 1; $i >= 0; --$i) {
             $elems->item($i)->removeAttribute('data-candidate');
@@ -519,7 +520,7 @@ public function getInnerText(?\DOMNode $e, bool $normalizeSpaces = true, bool $f
     /**
      * Remove the style attribute on every $e and under.
      */
-    public function cleanStyles(\DOMElement $e): void
+    public function cleanStyles(JSLikeHTMLElement $e): void
     {
         if (\is_object($e)) {
             $elems = $e->getElementsByTagName('*');
@@ -552,7 +553,7 @@ public function getWordCount(string $text): int
      * This is the amount of text that is inside a link divided by the total text in the node.
      * Can exclude external references to differentiate between simple text and menus/infoblocks.
      */
-    public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float
+    public function getLinkDensity(JSLikeHTMLElement $e, bool $excludeExternal = false): float
     {
         $links = $e->getElementsByTagName('a');
         $textLength = mb_strlen($this->getInnerText($e, true, true));
@@ -575,7 +576,7 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f
     /**
      * Get an element relative weight.
      */
-    public function getWeight(\DOMElement $e): int
+    public function getWeight(JSLikeHTMLElement $e): int
     {
         if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
             return 0;
@@ -606,7 +607,7 @@ public function killBreaks(JSLikeHTMLElement $node): void
      *
      * Updated 2012-09-18 to preserve youtube/vimeo iframes
      */
-    public function clean(\DOMElement $e, string $tag): void
+    public function clean(JSLikeHTMLElement $e, string $tag): void
     {
         $targetList = $e->getElementsByTagName($tag);
         $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@@ -638,7 +639,7 @@ public function clean(\DOMElement $e, string $tag): void
      * "Fishy" is an algorithm based on content length, classnames,
      * link density, number of images & embeds, etc.
      */
-    public function cleanConditionally(\DOMElement $e, string $tag): void
+    public function cleanConditionally(JSLikeHTMLElement $e, string $tag): void
     {
         if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
             return;
@@ -751,7 +752,7 @@ public function cleanConditionally(\DOMElement $e, string $tag): void
     /**
      * Clean out spurious headers from an Element. Checks things like classnames and link density.
      */
-    public function cleanHeaders(\DOMElement $e): void
+    public function cleanHeaders(JSLikeHTMLElement $e): void
     {
         for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
             $headers = $e->getElementsByTagName('h' . $headerIndex);
@@ -791,7 +792,7 @@ public function removeFlag(int $flag): void
     /**
      * Get the article title as an H1.
      */
-    protected function getArticleTitle(): \DOMElement
+    protected function getArticleTitle(): JSLikeHTMLElement
     {
         try {
             $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
@@ -861,7 +862,7 @@ protected function prepDocument(): void
      * Initialize a node with the readability object. Also checks the
      * className/id for special names to add to its score.
      */
-    protected function initializeNode(\DOMElement $node): void
+    protected function initializeNode(JSLikeHTMLElement $node): void
     {
         if (!isset($node->tagName)) {
             return;
@@ -929,10 +930,8 @@ protected function initializeNode(\DOMElement $node): void
     /**
      * Using a variety of metrics (content score, classname, element types), find the content that is
      * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
-     *
-     * @return \DOMElement|false
      */
-    protected function grabArticle(?\DOMElement $page = null)
+    protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElement
     {
         if (!$page) {
             $page = $this->dom;
@@ -1078,7 +1077,7 @@ protected function grabArticle(?\DOMElement $page = null)
 
             foreach ($ancestors as $level => $ancestor) {
                 if (!$ancestor->nodeName || !$ancestor->parentNode) {
-                    return false;
+                    return null;
                 }
 
                 if (!$ancestor->hasAttribute('readability')) {
@@ -1103,13 +1102,13 @@ protected function grabArticle(?\DOMElement $page = null)
          * This is faster to do before scoring but safer after.
          */
         if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
-            /** @var \DOMNodeList<\DOMElement> */
+            /** @var \DOMNodeList<JSLikeHTMLElement> */
             $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
 
             for ($c = $candidates->length - 1; $c >= 0; --$c) {
                 $node = $candidates->item($c);
                 // node should be readable but not inside of an article otherwise it's probably non-readable block
-                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof \DOMElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
+                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof JSLikeHTMLElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
                     $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
                     $node->parentNode->removeChild($node);
                 }
@@ -1130,7 +1129,7 @@ protected function grabArticle(?\DOMElement $page = null)
         $topCandidates = array_fill(0, 5, null);
         if ($xpath) {
             // Using array of DOMElements after deletion is a path to DOOMElement.
-            /** @var \DOMNodeList<\DOMElement> */
+            /** @var \DOMNodeList<JSLikeHTMLElement> */
             $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
             $this->logger->debug('Candidates: ' . $candidates->length);
 
@@ -1157,7 +1156,7 @@ protected function grabArticle(?\DOMElement $page = null)
             }
         }
 
-        /** @var \DOMNodeList<\DOMElement> */
+        /** @var \DOMNodeList<JSLikeHTMLElement> */
         $topCandidates = array_filter(
             $topCandidates,
             fn ($v, $idx) => 0 === $idx || null !== $v,
@@ -1250,7 +1249,7 @@ protected function grabArticle(?\DOMElement $page = null)
         if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
             $up = $topCandidate;
 
-            if ($up->parentNode instanceof \DOMElement) {
+            if ($up->parentNode instanceof JSLikeHTMLElement) {
                 $up = $up->parentNode;
 
                 if (0 === strcasecmp($up->tagName, 'table')) {
@@ -1280,19 +1279,19 @@ protected function grabArticle(?\DOMElement $page = null)
             $siblingNode = $siblingNodes->item($s);
             $siblingNodeName = $siblingNode->nodeName;
             $append = false;
-            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof \DOMElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
 
             if ($siblingNode->isSameNode($topCandidate)) {
                 $append = true;
             } else {
                 $contentBonus = 0;
 
                 // Give a bonus if sibling nodes and top candidates have the same classname.
-                if ($siblingNode instanceof \DOMElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
+                if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
                     $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
                 }
 
-                if ($siblingNode instanceof \DOMElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
+                if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
                     $append = true;
                 } elseif (0 === strcasecmp($siblingNodeName, 'p')) {
                     $linkDensity = (int) $this->getLinkDensity($siblingNode);
@@ -1369,7 +1368,7 @@ protected function grabArticle(?\DOMElement $page = null)
                 return $this->grabArticle($this->body);
             }
 
-            return false;
+            return null;
         }
 
         return $articleContent;
@@ -1379,7 +1378,7 @@ protected function grabArticle(?\DOMElement $page = null)
      * Get an element weight by attribute.
      * Uses regular expressions to tell if this element looks good or bad.
      */
-    protected function weightAttribute(\DOMElement $element, string $attribute): int
+    protected function weightAttribute(JSLikeHTMLElement $element, string $attribute): int
     {
         if (!$element->hasAttribute($attribute)) {
             return 0;
@@ -1423,7 +1422,7 @@ protected function reinitBody(): void
      *
      * @param callable(float): float $f
      */
-    private static function updateContentScore(\DOMElement $element, callable $f): void
+    private static function updateContentScore(JSLikeHTMLElement $element, callable $f): void
     {
         $readabilityAttr = $element->getAttributeNode('readability');
         $prevScore = (float) $readabilityAttr->value;
@@ -1433,7 +1432,7 @@ private static function updateContentScore(\DOMElement $element, callable $f): v
     /**
      * Gets the content score for given element.
      */
-    private static function getContentScore(\DOMElement $element): float
+    private static function getContentScore(JSLikeHTMLElement $element): float
     {
         return $element->hasAttribute('readability') ? (float) $element->getAttribute('readability') : 0;
     }
@@ -1505,11 +1504,11 @@ private function loadHtml(): void
         $this->dom->registerNodeClass(\DOMElement::class, JSLikeHTMLElement::class);
     }
 
-    private function getAncestors(\DOMElement $node, int $maxDepth = 0): array
+    private function getAncestors(JSLikeHTMLElement $node, int $maxDepth = 0): array
     {
         $ancestors = [];
         $i = 0;
-        while ($node->parentNode instanceof \DOMElement) {
+        while ($node->parentNode instanceof JSLikeHTMLElement) {
             $ancestors[] = $node->parentNode;
             if (++$i === $maxDepth) {
                 break;
@@ -1537,7 +1536,7 @@ private function isPhrasingContent($node): bool
             );
     }
 
-    private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
+    private function hasSingleTagInsideElement(JSLikeHTMLElement $node, string $tag): bool
     {
         if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
             return false;
@@ -1557,7 +1556,7 @@ private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
      * Tidy must be configured to not clean the input for this function to
      * work as expected, see $this->tidy_config['clean']
      */
-    private function isNodeVisible(\DOMElement $node): bool
+    private function isNodeVisible(JSLikeHTMLElement $node): bool
     {
         return !(
             $node->hasAttribute('style')