From 6f6cff092dd682a60065ad4990dc11aff90da893 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 17:15:53 +0100 Subject: [PATCH 01/12] Remove deprecated debug property Breaking BC for Readability 3.0. --- src/Readability.php | 2 -- tests/ReadabilityTest.php | 21 --------------------- 2 files changed, 23 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 836a333..aa49727 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -37,8 +37,6 @@ class Readability implements LoggerAwareInterface public $url = null; // preserves more content (experimental) public $lightClean = true; - // no more used, keept to avoid BC - public $debug = false; public $tidied = false; /** diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 3fb9dc0..2fee95f 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -115,7 +115,6 @@ public function testInitDivP(): void public function testInitDiv(): void { $readability = $this->getReadability('
' . str_repeat('This is the awesome content :)', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -129,7 +128,6 @@ public function testInitDiv(): void public function testWithFootnotes(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $readability->convertLinksToFootnotes = true; $res = $readability->init(); @@ -146,7 +144,6 @@ public function testWithFootnotes(): void public function testStandardClean(): void { $readability = $this->getReadability('

Title

' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . 'will NOT be removed
', 'http://0.0.0.0'); - $readability->debug = true; $readability->lightClean = false; $res = $readability->init(); @@ -163,7 +160,6 @@ public function testStandardClean(): void public function testWithIframe(): void { $readability = $this->getReadability('

Title

' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '

This is an awesome text with some links, here there are

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -178,7 +174,6 @@ public function testWithIframe(): void public function testWithArticle(): void { $readability = $this->getReadability('

' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '

This is an awesome text with some links, here there are

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -193,7 +188,6 @@ public function testWithArticle(): void public function testWithAside(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -208,7 +202,6 @@ public function testWithAside(): void public function testWithClasses(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
' . str_repeat('

This text should be removed

', 10) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -223,7 +216,6 @@ public function testWithClasses(): void public function testWithClassesWithoutLightClean(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
' . str_repeat('

This text should be removed

', 10) . '
', 'http://0.0.0.0'); - $readability->debug = true; $readability->lightClean = false; $res = $readability->init(); @@ -239,7 +231,6 @@ public function testWithClassesWithoutLightClean(): void public function testWithTd(): void { $readability = $this->getReadability('' . str_repeat('', 7) . '

This is an awesome text with some links, here there are the awesome

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -252,7 +243,6 @@ public function testWithTd(): void public function testWithSameClasses(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '
This text is also an awesome text and you should know that !
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -266,7 +256,6 @@ public function testWithSameClasses(): void public function testWithScript(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -280,7 +269,6 @@ public function testWithScript(): void public function testTitle(): void { $readability = $this->getReadability('this is my title
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -294,7 +282,6 @@ public function testTitle(): void public function testTitleWithDash(): void { $readability = $this->getReadability(' title2 - title3
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -308,7 +295,6 @@ public function testTitleWithDash(): void public function testTitleWithDoubleDot(): void { $readability = $this->getReadability(' title2 : title3
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -322,7 +308,6 @@ public function testTitleWithDoubleDot(): void public function testTitleTooShortUseH1(): void { $readability = $this->getReadability('too short

this is my h1 title !

' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -369,7 +354,6 @@ public function testAutoClosingIframeNotThrowingException(): void '; $readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030'); - $readability->debug = true; $res = $readability->init(); @@ -437,7 +421,6 @@ public function testAppendIdAlreadyHere(): void '; $readability = $this->getReadability($data, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); @@ -474,7 +457,6 @@ public function testChildNodeGoneNull(): void $html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html'); $readability = $this->getReadability($html, 'http://0.0.0.0'); - $readability->debug = true; $readability->convertLinksToFootnotes = true; $res = $readability->init(); @@ -487,7 +469,6 @@ public function testKeepFootnotes(): void $html = (string) file_get_contents('tests/fixtures/keepFootnotes.html'); $readability = $this->getReadability($html, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -501,7 +482,6 @@ public function testWithWipedBody(): void $html = (string) file_get_contents('tests/fixtures/wipedBody.html'); $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -540,7 +520,6 @@ public function dataForVisibleNode(): array public function testVisibleNode(string $content, bool $shouldBeVisible): void { $readability = $this->getReadability($content, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); if ($shouldBeVisible) { From dfa7cb08cdcb141b8bb75dd1ee019c422ce3d4e7 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 17:57:49 +0100 Subject: [PATCH 02/12] Fix missing return value in grabArticle Not sure if this is expected but at least it works the same as before. 
--- src/Readability.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Readability.php b/src/Readability.php index aa49727..22b2ee1 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1039,7 +1039,7 @@ protected function grabArticle(?\DOMElement $page = null) foreach ($ancestors as $level => $ancestor) { if (!$ancestor->nodeName || !$ancestor->parentNode) { - return; + return false; } if (!$ancestor->hasAttribute('readability')) { From c83d33738aa84752321c57401dcb4d2a6ed8269d Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 18:33:56 +0100 Subject: [PATCH 03/12] Use helpers for content score manipulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DOMAttr::$value` must be a `string`. Let’s add helpers for manipulating the `readability` attribute so that we do not have to keep casting it from and to `string` in order to appease `strict_types`. --- src/Readability.php | 59 ++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 22b2ee1..e7e1db5 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -616,7 +616,7 @@ public function cleanConditionally(\DOMElement $e, string $tag): void for ($i = $curTagsLength - 1; $i >= 0; --$i) { $node = $tagsList->item($i); $weight = $this->getWeight($node); - $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; + $contentScore = self::getContentScore($node); $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . 
$node->getAttribute('readability')) : '')); // XXX Incomplete implementation @@ -829,29 +829,26 @@ protected function initializeNode(\DOMElement $node): void return; } - $readability = $this->dom->createAttribute('readability'); - // this is our contentScore - $readability->value = 0; - $node->setAttributeNode($readability); + $contentScore = 0; // using strtoupper just in case switch (strtoupper($node->tagName)) { case 'ARTICLE': - $readability->value += 15; + $contentScore += 15; // no break case 'DIV': - $readability->value += 5; + $contentScore += 5; break; case 'PRE': case 'CODE': case 'TD': case 'BLOCKQUOTE': case 'FIGURE': - $readability->value += 3; + $contentScore += 3; break; case 'SECTION': // often misused - // $readability->value += 2; + // $contentScore += 2; break; case 'OL': case 'UL': @@ -859,7 +856,7 @@ protected function initializeNode(\DOMElement $node): void case 'DD': case 'DT': case 'LI': - $readability->value -= 3; + $contentScore -= 3; break; case 'ASIDE': case 'FOOTER': @@ -870,7 +867,7 @@ protected function initializeNode(\DOMElement $node): void case 'TEXTAREA': case 'INPUT': case 'NAV': - $readability->value -= 3; + $contentScore -= 3; break; case 'H1': case 'H2': @@ -880,11 +877,15 @@ protected function initializeNode(\DOMElement $node): void case 'H6': case 'TH': case 'HGROUP': - $readability->value -= 5; + $contentScore -= 5; break; } - $readability->value += $this->getWeight($node); + $contentScore += $this->getWeight($node); + + $readability = $this->dom->createAttribute('readability'); + $readability->value = (string) $contentScore; + $node->setAttributeNode($readability); } /** @@ -1054,7 +1055,8 @@ protected function grabArticle(?\DOMElement $page = null) } else { $scoreDivider = $level * 3; } - $ancestor->getAttributeNode('readability')->value += $contentScore / $scoreDivider; + + self::updateContentScore($ancestor, fn ($prevScore) => $prevScore + $contentScore / $scoreDivider); } } @@ -1069,7 +1071,7 @@ protected function 
grabArticle(?\DOMElement $page = null) $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { - $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node)); $node->parentNode->removeChild($node); } } @@ -1098,14 +1100,13 @@ protected function grabArticle(?\DOMElement $page = null) // Scale the final candidates score based on link density. Good content should have a // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. - $readability = $item->getAttributeNode('readability'); - $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP); + self::updateContentScore($item, fn ($prevScore) => round($prevScore * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP)); for ($t = 0; $t < 5; ++$t) { $aTopCandidate = $topCandidates[$t]; - if (!$aTopCandidate || $readability->value > (int) $aTopCandidate->getAttribute('readability')) { - $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); + if (!$aTopCandidate || self::getContentScore($item) > self::getContentScore($aTopCandidate)) { + $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . 
$item->getAttribute('id') . ') with score ' . self::getContentScore($item)); array_splice($topCandidates, $t, 0, [$item]); if (\count($topCandidates) > 5) { array_pop($topCandidates); @@ -1376,6 +1377,26 @@ protected function reinitBody(): void } } + /** + * Updates the content score for the given element using the provided function. + * + * @param callable(float): float $f + */ + private static function updateContentScore(\DOMElement $element, callable $f): void + { + $readabilityAttr = $element->getAttributeNode('readability'); + $prevScore = (float) $readabilityAttr->value; + $readabilityAttr->value = (string) $f($prevScore); + } + + /** + * Gets the content score for given element. + */ + private static function getContentScore(\DOMElement $element): float + { + return $element->hasAttribute('readability') ? (float) $element->getAttribute('readability') : 0; + } + /** * Load HTML in a DOMDocument. * Apply Pre filters From 7a2ae65f7289fb2226d1b48c386f979df86c62fa Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 17:15:31 +0100 Subject: [PATCH 04/12] Use PHP type hints instead of PHPDoc PHP 7.4 supports property type hints. - `getInnerText` actually accepts `DOMNode`, not `DOMElement`, since it can be e.g. `DOMText`. Really, we just need the `textContent` property. - Since we now use property type hints, we can drop the `no_null_property_initialization` PHP-CS-Fixer rule. 
--- .php-cs-fixer.php | 2 - src/Readability.php | 135 ++++++++++++++++++++++++-------------- tests/ReadabilityTest.php | 8 +-- 3 files changed, 88 insertions(+), 57 deletions(-) diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php index 5f09a0c..d0c927c 100644 --- a/.php-cs-fixer.php +++ b/.php-cs-fixer.php @@ -28,8 +28,6 @@ 'concat_space' => ['spacing' => 'one'], // Pulled in by @Symfony:risky but we still support PHP 7.4 'modernize_strpos' => false, - // Pulled in by @Symfony, we cannot add property types until we bump PHP to ≥ 7.4 - 'no_null_property_initialization' => false, ]) ->setFinder($finder) ; diff --git a/src/Readability.php b/src/Readability.php index e7e1db5..e87aec6 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -24,26 +24,36 @@ class Readability implements LoggerAwareInterface public const MIN_ARTICLE_LENGTH = 200; public const MIN_NODE_LENGTH = 80; public const MAX_LINK_DENSITY = 0.25; - public $convertLinksToFootnotes = false; - public $revertForcedParagraphElements = false; - public $articleTitle; - public $articleContent; - public $original_html; + + public bool $convertLinksToFootnotes = false; + public bool $revertForcedParagraphElements = false; + + public ?\DOMElement $articleTitle; + + public ?\DOMElement $articleContent; + + public ?string $original_html; + + public ?\DOMDocument $dom; + /** - * @var \DOMDocument + * @var ?string URL where HTML was retrieved */ - public $dom; - // optional - URL where HTML was retrieved - public $url = null; - // preserves more content (experimental) - public $lightClean = true; - public $tidied = false; + public ?string $url = null; /** - * All of the regular expressions in use within readability. + * @var bool preserves more content (experimental) + */ + public bool $lightClean = true; + + public bool $tidied = false; + + /** + * @var array All of the regular expressions in use within readability. + * * Defined up here so we don't instantiate them repeatedly in loops. 
*/ - public $regexps = [ + public array $regexps = [ 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i', @@ -55,10 +65,18 @@ class Readability implements LoggerAwareInterface 'hasContent' => '/\S$/', 'isNotVisible' => '/display\s*:\s*none/', ]; - public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre']; - // The commented out elements qualify as phrasing content but tend to be - // removed by readability when put into paragraphs, so we ignore them here. - public $phrasingElements = [ + + /** + * @var array + */ + public array $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre']; + + /** + * @var array + */ + public array $phrasingElements = [ + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. 
// "CANVAS", "IFRAME", "SVG", "VIDEO", 'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA', 'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL', @@ -66,7 +84,11 @@ class Readability implements LoggerAwareInterface 'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB', 'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR', ]; - public $tidy_config = [ + + /** + * @var array + */ + public array $tidy_config = [ 'tidy-mark' => false, 'vertical-space' => false, 'doctype' => 'omit', @@ -90,21 +112,41 @@ class Readability implements LoggerAwareInterface 'output-encoding' => 'utf8', 'hide-comments' => true, ]; - // article domain regexp for calibration - protected $domainRegExp = null; - protected $body = null; - // Cache the body HTML in case we need to re-use it later - protected $bodyCache = null; - // 1 | 2 | 4; // Start with all processing flags set. - protected $flags = 7; - // indicates whether we were able to extract or not - protected $success = false; - protected $logger; - protected $parser; - protected $html; - protected $useTidy; - // raw HTML filters - protected $pre_filters = [ + + /** + * @var ?string article domain regexp for calibration + */ + protected ?string $domainRegExp = null; + + protected ?\DOMElement $body = null; + + /** + * @var ?string Cache the body HTML in case we need to re-use it later + */ + protected ?string $bodyCache = null; + + /** + * @var int-mask-of start with all processing flags set + */ + protected int $flags = self::FLAG_STRIP_UNLIKELYS | self::FLAG_WEIGHT_ATTRIBUTES | self::FLAG_CLEAN_CONDITIONALLY; + + /** + * @var bool indicates whether we were able to extract or not + */ + protected bool $success = false; + + protected LoggerInterface $logger; + + protected string $parser; + + protected string $html; + + protected bool $useTidy; + + /** + * @var array raw HTML filters + */ + protected array $pre_filters = [ // remove spans as we redefine styles and they're probably special-styled 
'!]*>!is' => '', // HACK: firewall-filtered content @@ -116,8 +158,11 @@ class Readability implements LoggerAwareInterface // replace fonts to spans '!<(/?)font[^>]*>!is' => '<\\1span>', ]; - // output HTML filters - protected $post_filters = [ + + /** + * @var array output HTML filters + */ + protected array $post_filters = [ // replace excessive br's '/\s*

'articleTitle; } /** * Get article content element. - * - * @return \DOMElement */ - public function getContent() + public function getContent(): \DOMElement { return $this->articleContent; } @@ -452,12 +493,8 @@ public function prepArticle(\DOMNode $articleContent): void /** * Get the inner text of a node. * This also strips out any excess whitespace to be found. - * - * @param \DOMElement $e - * @param bool $normalizeSpaces (default: true) - * @param bool $flattenLines (default: false) */ - public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string + public function getInnerText(?\DOMNode $e, bool $normalizeSpaces = true, bool $flattenLines = false): string { if (null === $e || !isset($e->textContent) || '' === $e->textContent) { return ''; @@ -750,10 +787,8 @@ public function removeFlag(int $flag): void /** * Get the article title as an H1. - * - * @return \DOMElement */ - protected function getArticleTitle() + protected function getArticleTitle(): \DOMElement { try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 2fee95f..cce4568 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -10,10 +10,8 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase { - /** @var TestHandler */ - public $logHandler; - /** @var LoggerInterface */ - public $logger; + public TestHandler $logHandler; + public LoggerInterface $logger; /** * @requires extension tidy @@ -323,7 +321,7 @@ public function testAutoClosingIframeNotThrowingException(): void $oldErrorReporting = error_reporting(\E_ALL | \E_STRICT); $oldDisplayErrors = ini_set('display_errors', '1'); // dummy function to be used to the next test - set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline, array $errcontext) { + set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline, 
array $errcontext): bool { throw new \Exception($errstr, $errno); }, \E_ALL | \E_STRICT); From 1acfc6fede1e256a6a2e3c1ecc777cdd3168e68e Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 21:51:43 +0100 Subject: [PATCH 05/12] Use JSLikeHTMLElement in type hints It is more specific than DOMElement. This allows us to get rid of the assertions in tests. --- src/Readability.php | 12 ++++++------ tests/ReadabilityTest.php | 41 --------------------------------------- 2 files changed, 6 insertions(+), 47 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index e87aec6..8cf7202 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -28,9 +28,9 @@ class Readability implements LoggerAwareInterface public bool $convertLinksToFootnotes = false; public bool $revertForcedParagraphElements = false; - public ?\DOMElement $articleTitle; + public ?JSLikeHTMLElement $articleTitle; - public ?\DOMElement $articleContent; + public ?JSLikeHTMLElement $articleContent; public ?string $original_html; @@ -203,7 +203,7 @@ public function setLogger(LoggerInterface $logger): void /** * Get article title element. */ - public function getTitle(): \DOMElement + public function getTitle(): JSLikeHTMLElement { return $this->articleTitle; } @@ -211,7 +211,7 @@ public function getTitle(): \DOMElement /** * Get article content element. */ - public function getContent(): \DOMElement + public function getContent(): JSLikeHTMLElement { return $this->articleContent; } @@ -394,7 +394,7 @@ public function addFootnotes(\DOMElement $articleContent): void */ public function prepArticle(\DOMNode $articleContent): void { - if (!$articleContent instanceof \DOMElement) { + if (!$articleContent instanceof JSLikeHTMLElement) { return; } @@ -590,7 +590,7 @@ public function getWeight(\DOMElement $e): int /** * Remove extraneous break tags from a node. 
*/ - public function killBreaks(\DOMElement $node): void + public function killBreaks(JSLikeHTMLElement $node): void { $html = $node->getInnerHTML(); $html = preg_replace($this->regexps['killBreaks'], '
', $html); diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index cce4568..21f9bf7 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -5,7 +5,6 @@ use Monolog\Handler\TestHandler; use Monolog\Logger; use Psr\Log\LoggerInterface; -use Readability\JSLikeHTMLElement; use Readability\Readability; class ReadabilityTest extends \PHPUnit\Framework\TestCase @@ -78,8 +77,6 @@ public function testInitNoContent(): void $res = $readability->init(); $this->assertFalse($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertStringContainsString('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml()); } @@ -90,8 +87,6 @@ public function testInitP(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('

getContent()->getInnerHtml()); @@ -103,8 +98,6 @@ public function testInitDivP(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('
getContent()->getInnerHtml()); @@ -116,8 +109,6 @@ public function testInitDiv(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('
getContent()->getInnerHtml()); @@ -130,8 +121,6 @@ public function testWithFootnotes(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('
getContent()->getInnerHtml()); @@ -146,8 +135,6 @@ public function testStandardClean(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('
getContent()->getInnerHtml()); @@ -161,8 +148,6 @@ public function testWithIframe(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('
getContent()->getInnerHtml()); @@ -175,8 +160,6 @@ public function testWithArticle(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); @@ -189,8 +172,6 @@ public function testWithAside(): void $res = $readability->init(); $this->assertTrue($res); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent()); - $this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle()); $this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('