diff --git a/src/Readability.php b/src/Readability.php index 836a333..7378446 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -302,8 +302,7 @@ public function addFootnotes(\DOMElement $articleContent): void $articleLinks = $articleContent->getElementsByTagName('a'); $linkCount = 0; - for ($i = 0; $i < $articleLinks->length; ++$i) { - $articleLink = $articleLinks->item($i); + foreach ($articleLinks as $articleLink) { $footnoteLink = $articleLink->cloneNode(true); $refLink = $this->dom->createElement('a'); $footnote = $this->dom->createElement('li'); @@ -375,16 +374,15 @@ public function prepArticle(\DOMNode $articleContent): void * which is what they were before. */ $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent); - for ($i = $elems->length - 1; $i >= 0; --$i) { - $e = $elems->item($i); + foreach (iterator_to_array($elems) as $e) { $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); } } // Remove service data-candidate attribute. $elems = $xpath->query('.//*[@data-candidate]', $articleContent); - for ($i = $elems->length - 1; $i >= 0; --$i) { - $elems->item($i)->removeAttribute('data-candidate'); + foreach ($elems as $elem) { + $elem->removeAttribute('data-candidate'); } // Clean out junk from the article content. @@ -416,9 +414,7 @@ public function prepArticle(\DOMNode $articleContent): void // Remove extra paragraphs. $articleParagraphs = $articleContent->getElementsByTagName('p'); - for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) { - $item = $articleParagraphs->item($i); - + foreach (iterator_to_array($articleParagraphs) as $item) { $imgCount = $item->getElementsByTagName('img')->length; $embedCount = $item->getElementsByTagName('embed')->length; $objectCount = $item->getElementsByTagName('object')->length; @@ -520,11 +516,12 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f $textLength = mb_strlen($this->getInnerText($e, true, true)); $linkLength = 0; - for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) { - if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { + $dRe = $this->domainRegExp; + foreach ($links as $link) { + if ($excludeExternal && $dRe && !preg_match($dRe, $link->getAttribute('href'))) { continue; } - $linkLength += mb_strlen($this->getInnerText($links->item($i))); + $linkLength += mb_strlen($this->getInnerText($link)); } if ($textLength > 0 && $linkLength > 0) { @@ -573,10 +570,8 @@ public function clean(\DOMElement $e, string $tag): void $targetList = $e->getElementsByTagName($tag); $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); - for ($y = $targetList->length - 1; $y >= 0; --$y) { + foreach (iterator_to_array($targetList) as $currentItem) { // Allow youtube and vimeo videos through as people usually want to see those. - $currentItem = $targetList->item($y); - if ($isEmbed) { $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href'); @@ -586,7 +581,7 @@ public function clean(\DOMElement $e, string $tag): void } // Then check the elements inside this element for the same. - if (preg_match($this->regexps['media'], $targetList->item($y)->getInnerHTML())) { + if (preg_match($this->regexps['media'], $currentItem->getInnerHTML())) { continue; } } @@ -607,7 +602,6 @@ public function cleanConditionally(\DOMElement $e, string $tag): void } $tagsList = $e->getElementsByTagName($tag); - $curTagsLength = $tagsList->length; /* * Gather counts for other typical elements embedded within. @@ -615,8 +609,7 @@ public function cleanConditionally(\DOMElement $e, string $tag): void * * TODO: Consider taking into account original contentScore here. */ - for ($i = $curTagsLength - 1; $i >= 0; --$i) { - $node = $tagsList->item($i); + foreach (iterator_to_array($tagsList) as $node) { $weight = $this->getWeight($node); $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); @@ -640,15 +633,15 @@ public function cleanConditionally(\DOMElement $e, string $tag): void $embedCount = 0; $embeds = $node->getElementsByTagName('embed'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + foreach ($embeds as $embed) { + if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) { ++$embedCount; } } $embeds = $node->getElementsByTagName('iframe'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + foreach ($embeds as $embed) { + if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) { ++$embedCount; } } @@ -718,9 +711,9 @@ public function cleanHeaders(\DOMElement $e): void for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { $headers = $e->getElementsByTagName('h' . $headerIndex); - for ($i = $headers->length - 1; $i >= 0; --$i) { - if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { - $headers->item($i)->parentNode->removeChild($headers->item($i)); + foreach (iterator_to_array($headers) as $header) { + if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) { + $header->parentNode->removeChild($header); } } } @@ -811,13 +804,13 @@ protected function prepDocument(): void // Remove all style tags in head. $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length - 1; $i >= 0; --$i) { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + foreach (iterator_to_array($styleTags) as $styleTag) { + $styleTag->parentNode->removeChild($styleTag); } $linkTags = $this->dom->getElementsByTagName('link'); - for ($i = $linkTags->length - 1; $i >= 0; --$i) { - $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); + foreach (iterator_to_array($linkTags) as $linkTag) { + $linkTag->parentNode->removeChild($linkTag); } } @@ -910,8 +903,7 @@ protected function grabArticle(?\DOMElement $page = null) $allElements = $page->getElementsByTagName('*'); - for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) { - $node = $allElements->item($nodeIndex); + foreach (iterator_to_array($allElements) as $node) { $tagName = $node->tagName; $nodeContent = $node->getInnerHTML(); @@ -924,7 +916,6 @@ protected function grabArticle(?\DOMElement $page = null) if (!$this->isNodeVisible($node)) { $this->logger->debug('Removing invisible node ' . $node->getNodePath()); $node->parentNode->removeChild($node); - --$nodeIndex; continue; } @@ -937,7 +928,6 @@ protected function grabArticle(?\DOMElement $page = null) ) { $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"'); $node->parentNode->removeChild($node); - --$nodeIndex; continue; } @@ -956,7 +946,6 @@ protected function grabArticle(?\DOMElement $page = null) $newNode->setInnerHtml($nodeContent); $node->parentNode->replaceChild($newNode, $node); - --$nodeIndex; $nodesToScore[] = $newNode; } catch (\Exception $e) { $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage()); @@ -1015,15 +1004,15 @@ protected function grabArticle(?\DOMElement $page = null) * A score is determined by things like number of commas, class names, etc. * Maybe eventually link density. */ - for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) { - $ancestors = $this->getAncestors($nodesToScore[$pt], 5); + foreach ($nodesToScore as $nodeToScore) { + $ancestors = $this->getAncestors($nodeToScore, 5); // No parent node? Move on... if (0 === \count($ancestors)) { continue; } - $innerText = $this->getInnerText($nodesToScore[$pt]); + $innerText = $this->getInnerText($nodeToScore); // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) { @@ -1067,8 +1056,7 @@ protected function grabArticle(?\DOMElement $page = null) if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); + foreach (iterator_to_array($candidates) as $node) { // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); @@ -1076,11 +1064,6 @@ protected function grabArticle(?\DOMElement $page = null) } } - $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); - - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); - } unset($candidates); } @@ -1094,9 +1077,7 @@ protected function grabArticle(?\DOMElement $page = null) $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $this->logger->debug('Candidates: ' . $candidates->length); - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $item = $candidates->item($c); - + foreach (iterator_to_array($candidates) as $item) { // Scale the final candidates score based on link density. Good content should have a // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. @@ -1231,13 +1212,7 @@ protected function grabArticle(?\DOMElement $page = null) $parentOfTopCandidate = $topCandidate->parentNode; $siblingNodes = $parentOfTopCandidate->childNodes; - if (0 === $siblingNodes->length) { - $siblingNodes = new \stdClass(); - $siblingNodes->length = 0; - } - - for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) { - $siblingNode = $siblingNodes->item($s); + foreach (iterator_to_array($siblingNodes) as $siblingNode) { $siblingNodeName = $siblingNode->nodeName; $append = false; $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); @@ -1280,13 +1255,9 @@ protected function grabArticle(?\DOMElement $page = null) } catch (\Exception $e) { $this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.'); $nodeToAppend = $siblingNode; - --$s; - --$sl; } } else { $nodeToAppend = $siblingNode; - --$s; - --$sl; } // To ensure a node does not interfere with readability styles, remove its classnames & ids.