Skip to content

Commit 8d17a88

Browse files
committed
JSHtml
1 parent 1bfa56b commit 8d17a88

File tree

1 file changed

+30
-31
lines changed

1 file changed

+30
-31
lines changed

src/Readability.php

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ class Readability implements LoggerAwareInterface
120120
*/
121121
protected ?string $domainRegExp = null;
122122

123-
protected ?\DOMElement $body = null;
123+
protected ?JSLikeHTMLElement $body = null;
124124

125125
/**
126126
* @var ?string Cache the body HTML in case we need to re-use it later
@@ -262,6 +262,7 @@ public function init(): bool
262262

263263
// Assume successful outcome
264264
$this->success = true;
265+
/** @var \DOMNodeList<JSLikeHTMLElement> */
265266
$bodyElems = $this->dom->getElementsByTagName('body');
266267

267268
// WTF multiple body nodes?
@@ -284,7 +285,7 @@ public function init(): bool
284285
$articleTitle = $this->getArticleTitle();
285286
$articleContent = $this->grabArticle();
286287

287-
if (!$articleContent) {
288+
if (null === $articleContent) {
288289
$this->success = false;
289290
$articleContent = $this->dom->createElement('div');
290291
$articleContent->setAttribute('class', 'readability-content');
@@ -423,7 +424,7 @@ public function prepArticle(\DOMNode $articleContent): void
423424
}
424425

425426
// Remove service data-candidate attribute.
426-
/** @var \DOMNodeList<\DOMElement> */
427+
/** @var \DOMNodeList<JSLikeHTMLElement> */
427428
$elems = $xpath->query('.//*[@data-candidate]', $articleContent);
428429
for ($i = $elems->length - 1; $i >= 0; --$i) {
429430
$elems->item($i)->removeAttribute('data-candidate');
@@ -519,7 +520,7 @@ public function getInnerText(?\DOMNode $e, bool $normalizeSpaces = true, bool $f
519520
/**
520521
* Remove the style attribute on every $e and under.
521522
*/
522-
public function cleanStyles(\DOMElement $e): void
523+
public function cleanStyles(JSLikeHTMLElement $e): void
523524
{
524525
if (\is_object($e)) {
525526
$elems = $e->getElementsByTagName('*');
@@ -552,7 +553,7 @@ public function getWordCount(string $text): int
552553
* This is the amount of text that is inside a link divided by the total text in the node.
553554
* Can exclude external references to differentiate between simple text and menus/infoblocks.
554555
*/
555-
public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float
556+
public function getLinkDensity(JSLikeHTMLElement $e, bool $excludeExternal = false): float
556557
{
557558
$links = $e->getElementsByTagName('a');
558559
$textLength = mb_strlen($this->getInnerText($e, true, true));
@@ -575,7 +576,7 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f
575576
/**
576577
* Get an element relative weight.
577578
*/
578-
public function getWeight(\DOMElement $e): int
579+
public function getWeight(JSLikeHTMLElement $e): int
579580
{
580581
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
581582
return 0;
@@ -606,7 +607,7 @@ public function killBreaks(JSLikeHTMLElement $node): void
606607
*
607608
* Updated 2012-09-18 to preserve youtube/vimeo iframes
608609
*/
609-
public function clean(\DOMElement $e, string $tag): void
610+
public function clean(JSLikeHTMLElement $e, string $tag): void
610611
{
611612
$targetList = $e->getElementsByTagName($tag);
612613
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@@ -638,7 +639,7 @@ public function clean(\DOMElement $e, string $tag): void
638639
* "Fishy" is an algorithm based on content length, classnames,
639640
* link density, number of images & embeds, etc.
640641
*/
641-
public function cleanConditionally(\DOMElement $e, string $tag): void
642+
public function cleanConditionally(JSLikeHTMLElement $e, string $tag): void
642643
{
643644
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
644645
return;
@@ -751,7 +752,7 @@ public function cleanConditionally(\DOMElement $e, string $tag): void
751752
/**
752753
* Clean out spurious headers from an Element. Checks things like classnames and link density.
753754
*/
754-
public function cleanHeaders(\DOMElement $e): void
755+
public function cleanHeaders(JSLikeHTMLElement $e): void
755756
{
756757
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
757758
$headers = $e->getElementsByTagName('h' . $headerIndex);
@@ -791,7 +792,7 @@ public function removeFlag(int $flag): void
791792
/**
792793
* Get the article title as an H1.
793794
*/
794-
protected function getArticleTitle(): \DOMElement
795+
protected function getArticleTitle(): JSLikeHTMLElement
795796
{
796797
try {
797798
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
@@ -861,7 +862,7 @@ protected function prepDocument(): void
861862
* Initialize a node with the readability object. Also checks the
862863
* className/id for special names to add to its score.
863864
*/
864-
protected function initializeNode(\DOMElement $node): void
865+
protected function initializeNode(JSLikeHTMLElement $node): void
865866
{
866867
if (!isset($node->tagName)) {
867868
return;
@@ -929,10 +930,8 @@ protected function initializeNode(\DOMElement $node): void
929930
/**
930931
* Using a variety of metrics (content score, classname, element types), find the content that is
931932
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
932-
*
933-
* @return \DOMElement|false
934933
*/
935-
protected function grabArticle(?\DOMElement $page = null)
934+
protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElement
936935
{
937936
if (!$page) {
938937
$page = $this->dom;
@@ -1078,7 +1077,7 @@ protected function grabArticle(?\DOMElement $page = null)
10781077

10791078
foreach ($ancestors as $level => $ancestor) {
10801079
if (!$ancestor->nodeName || !$ancestor->parentNode) {
1081-
return false;
1080+
return null;
10821081
}
10831082

10841083
if (!$ancestor->hasAttribute('readability')) {
@@ -1103,13 +1102,13 @@ protected function grabArticle(?\DOMElement $page = null)
11031102
* This is faster to do before scoring but safer after.
11041103
*/
11051104
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
1106-
/** @var \DOMNodeList<\DOMElement> */
1105+
/** @var \DOMNodeList<JSLikeHTMLElement> */
11071106
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
11081107

11091108
for ($c = $candidates->length - 1; $c >= 0; --$c) {
11101109
$node = $candidates->item($c);
11111110
// node should be readable but not inside of an article otherwise it's probably non-readable block
1112-
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof \DOMElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
1111+
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof JSLikeHTMLElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
11131112
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
11141113
$node->parentNode->removeChild($node);
11151114
}
@@ -1130,7 +1129,7 @@ protected function grabArticle(?\DOMElement $page = null)
11301129
$topCandidates = array_fill(0, 5, null);
11311130
if ($xpath) {
11321131
// Using array of DOMElements after deletion is a path to DOOMElement.
1133-
/** @var \DOMNodeList<\DOMElement> */
1132+
/** @var \DOMNodeList<JSLikeHTMLElement> */
11341133
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
11351134
$this->logger->debug('Candidates: ' . $candidates->length);
11361135

@@ -1157,7 +1156,7 @@ protected function grabArticle(?\DOMElement $page = null)
11571156
}
11581157
}
11591158

1160-
/** @var \DOMNodeList<\DOMElement> */
1159+
/** @var \DOMNodeList<JSLikeHTMLElement> */
11611160
$topCandidates = array_filter(
11621161
$topCandidates,
11631162
fn ($v, $idx) => 0 === $idx || null !== $v,
@@ -1250,7 +1249,7 @@ protected function grabArticle(?\DOMElement $page = null)
12501249
if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
12511250
$up = $topCandidate;
12521251

1253-
if ($up->parentNode instanceof \DOMElement) {
1252+
if ($up->parentNode instanceof JSLikeHTMLElement) {
12541253
$up = $up->parentNode;
12551254

12561255
if (0 === strcasecmp($up->tagName, 'table')) {
@@ -1280,19 +1279,19 @@ protected function grabArticle(?\DOMElement $page = null)
12801279
$siblingNode = $siblingNodes->item($s);
12811280
$siblingNodeName = $siblingNode->nodeName;
12821281
$append = false;
1283-
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof \DOMElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
1282+
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
12841283

12851284
if ($siblingNode->isSameNode($topCandidate)) {
12861285
$append = true;
12871286
} else {
12881287
$contentBonus = 0;
12891288

12901289
// Give a bonus if sibling nodes and top candidates have the same classname.
1291-
if ($siblingNode instanceof \DOMElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
1290+
if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
12921291
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
12931292
}
12941293

1295-
if ($siblingNode instanceof \DOMElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
1294+
if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
12961295
$append = true;
12971296
} elseif (0 === strcasecmp($siblingNodeName, 'p')) {
12981297
$linkDensity = (int) $this->getLinkDensity($siblingNode);
@@ -1369,7 +1368,7 @@ protected function grabArticle(?\DOMElement $page = null)
13691368
return $this->grabArticle($this->body);
13701369
}
13711370

1372-
return false;
1371+
return null;
13731372
}
13741373

13751374
return $articleContent;
@@ -1379,7 +1378,7 @@ protected function grabArticle(?\DOMElement $page = null)
13791378
* Get an element weight by attribute.
13801379
* Uses regular expressions to tell if this element looks good or bad.
13811380
*/
1382-
protected function weightAttribute(\DOMElement $element, string $attribute): int
1381+
protected function weightAttribute(JSLikeHTMLElement $element, string $attribute): int
13831382
{
13841383
if (!$element->hasAttribute($attribute)) {
13851384
return 0;
@@ -1423,7 +1422,7 @@ protected function reinitBody(): void
14231422
*
14241423
* @param callable(float): float $f
14251424
*/
1426-
private static function updateContentScore(\DOMElement $element, callable $f): void
1425+
private static function updateContentScore(JSLikeHTMLElement $element, callable $f): void
14271426
{
14281427
$readabilityAttr = $element->getAttributeNode('readability');
14291428
$prevScore = (float) $readabilityAttr->value;
@@ -1433,7 +1432,7 @@ private static function updateContentScore(\DOMElement $element, callable $f): v
14331432
/**
14341433
* Gets the content score for given element.
14351434
*/
1436-
private static function getContentScore(\DOMElement $element): float
1435+
private static function getContentScore(JSLikeHTMLElement $element): float
14371436
{
14381437
return $element->hasAttribute('readability') ? (float) $element->getAttribute('readability') : 0;
14391438
}
@@ -1505,11 +1504,11 @@ private function loadHtml(): void
15051504
$this->dom->registerNodeClass(\DOMElement::class, JSLikeHTMLElement::class);
15061505
}
15071506

1508-
private function getAncestors(\DOMElement $node, int $maxDepth = 0): array
1507+
private function getAncestors(JSLikeHTMLElement $node, int $maxDepth = 0): array
15091508
{
15101509
$ancestors = [];
15111510
$i = 0;
1512-
while ($node->parentNode instanceof \DOMElement) {
1511+
while ($node->parentNode instanceof JSLikeHTMLElement) {
15131512
$ancestors[] = $node->parentNode;
15141513
if (++$i === $maxDepth) {
15151514
break;
@@ -1537,7 +1536,7 @@ private function isPhrasingContent($node): bool
15371536
);
15381537
}
15391538

1540-
private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
1539+
private function hasSingleTagInsideElement(JSLikeHTMLElement $node, string $tag): bool
15411540
{
15421541
if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
15431542
return false;
@@ -1557,7 +1556,7 @@ private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
15571556
* Tidy must be configured to not clean the input for this function to
15581557
* work as expected, see $this->tidy_config['clean']
15591558
*/
1560-
private function isNodeVisible(\DOMElement $node): bool
1559+
private function isNodeVisible(JSLikeHTMLElement $node): bool
15611560
{
15621561
return !(
15631562
$node->hasAttribute('style')

0 commit comments

Comments
 (0)