Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 22 additions & 29 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,7 @@ public function addFootnotes(\DOMElement $articleContent): void
$articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0;

- for ($i = 0; $i < $articleLinks->length; ++$i) {
- $articleLink = $articleLinks->item($i);
+ foreach ($articleLinks as $articleLink) {
$footnoteLink = $articleLink->cloneNode(true);
$refLink = $this->dom->createElement('a');
$footnote = $this->dom->createElement('li');
Expand Down Expand Up @@ -383,8 +382,8 @@ public function prepArticle(\DOMNode $articleContent): void

// Remove service data-candidate attribute.
$elems = $xpath->query('.//*[@data-candidate]', $articleContent);
- for ($i = $elems->length - 1; $i >= 0; --$i) {
- $elems->item($i)->removeAttribute('data-candidate');
+ foreach ($elems as $elem) {
+ $elem->removeAttribute('data-candidate');
}

// Clean out junk from the article content.
Expand Down Expand Up @@ -520,11 +519,12 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f
$textLength = mb_strlen($this->getInnerText($e, true, true));
$linkLength = 0;

- for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
- if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
+ $dRe = $this->domainRegExp;
+ foreach ($links as $link) {
+ if ($excludeExternal && $dRe && !preg_match($dRe, $link->getAttribute('href'))) {
continue;
}
- $linkLength += mb_strlen($this->getInnerText($links->item($i)));
+ $linkLength += mb_strlen($this->getInnerText($link));
}

if ($textLength > 0 && $linkLength > 0) {
Expand Down Expand Up @@ -586,7 +586,7 @@ public function clean(\DOMElement $e, string $tag): void
}

// Then check the elements inside this element for the same.
- if (preg_match($this->regexps['media'], $targetList->item($y)->getInnerHTML())) {
+ if (preg_match($this->regexps['media'], $currentItem->getInnerHTML())) {
continue;
}
}
Expand Down Expand Up @@ -640,15 +640,15 @@ public function cleanConditionally(\DOMElement $e, string $tag): void
$embedCount = 0;
$embeds = $node->getElementsByTagName('embed');

- for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
- if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+ foreach ($embeds as $embed) {
+ if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
++$embedCount;
}
}

$embeds = $node->getElementsByTagName('iframe');
- for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
- if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+ foreach ($embeds as $embed) {
+ if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
++$embedCount;
}
}
Expand Down Expand Up @@ -719,8 +719,9 @@ public function cleanHeaders(\DOMElement $e): void
$headers = $e->getElementsByTagName('h' . $headerIndex);

for ($i = $headers->length - 1; $i >= 0; --$i) {
- if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
- $headers->item($i)->parentNode->removeChild($headers->item($i));
+ $header = $headers->item($i);
+ if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
+ $header->parentNode->removeChild($header);
}
}
}
Expand Down Expand Up @@ -812,12 +813,14 @@ protected function prepDocument(): void
// Remove all style tags in head.
$styleTags = $this->dom->getElementsByTagName('style');
for ($i = $styleTags->length - 1; $i >= 0; --$i) {
- $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
+ $styleTag = $styleTags->item($i);
+ $styleTag->parentNode->removeChild($styleTag);
}

$linkTags = $this->dom->getElementsByTagName('link');
for ($i = $linkTags->length - 1; $i >= 0; --$i) {
- $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
+ $linkTag = $linkTags->item($i);
+ $linkTag->parentNode->removeChild($linkTag);
}
}

Expand Down Expand Up @@ -1015,15 +1018,15 @@ protected function grabArticle(?\DOMElement $page = null)
* A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density.
*/
- for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) {
- $ancestors = $this->getAncestors($nodesToScore[$pt], 5);
+ foreach ($nodesToScore as $nodeToScore) {
+ $ancestors = $this->getAncestors($nodeToScore, 5);

// No parent node? Move on...
if (0 === \count($ancestors)) {
continue;
}

- $innerText = $this->getInnerText($nodesToScore[$pt]);
+ $innerText = $this->getInnerText($nodeToScore);

// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
Expand Down Expand Up @@ -1076,11 +1079,6 @@ protected function grabArticle(?\DOMElement $page = null)
}
}

- $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
-
- for ($c = $candidates->length - 1; $c >= 0; --$c) {
- $node = $candidates->item($c);
- }
unset($candidates);
}

Expand Down Expand Up @@ -1231,11 +1229,6 @@ protected function grabArticle(?\DOMElement $page = null)
$parentOfTopCandidate = $topCandidate->parentNode;
$siblingNodes = $parentOfTopCandidate->childNodes;

- if (0 === $siblingNodes->length) {
- $siblingNodes = new \stdClass();
- $siblingNodes->length = 0;
- }
-
for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
$siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName;
Expand Down