@@ -120,7 +120,7 @@ class Readability implements LoggerAwareInterface
120
120
*/
121
121
protected ?string $ domainRegExp = null ;
122
122
123
- protected ?\ DOMElement $ body = null ;
123
+ protected ?JSLikeHTMLElement $ body = null ;
124
124
125
125
/**
126
126
* @var ?string Cache the body HTML in case we need to re-use it later
@@ -262,6 +262,7 @@ public function init(): bool
262
262
263
263
// Assume successful outcome
264
264
$ this ->success = true ;
265
+ /** @var \DOMNodeList<JSLikeHTMLElement> */
265
266
$ bodyElems = $ this ->dom ->getElementsByTagName ('body ' );
266
267
267
268
// WTF multiple body nodes?
@@ -284,7 +285,7 @@ public function init(): bool
284
285
$ articleTitle = $ this ->getArticleTitle ();
285
286
$ articleContent = $ this ->grabArticle ();
286
287
287
- if (! $ articleContent ) {
288
+ if (null === $ articleContent ) {
288
289
$ this ->success = false ;
289
290
$ articleContent = $ this ->dom ->createElement ('div ' );
290
291
$ articleContent ->setAttribute ('class ' , 'readability-content ' );
@@ -423,7 +424,7 @@ public function prepArticle(\DOMNode $articleContent): void
423
424
}
424
425
425
426
// Remove service data-candidate attribute.
426
- /** @var \DOMNodeList<\DOMElement > */
427
+ /** @var \DOMNodeList<JSLikeHTMLElement > */
427
428
$ elems = $ xpath ->query ('.//*[@data-candidate] ' , $ articleContent );
428
429
for ($ i = $ elems ->length - 1 ; $ i >= 0 ; --$ i ) {
429
430
$ elems ->item ($ i )->removeAttribute ('data-candidate ' );
@@ -519,7 +520,7 @@ public function getInnerText(?\DOMNode $e, bool $normalizeSpaces = true, bool $f
519
520
/**
520
521
* Remove the style attribute on every $e and under.
521
522
*/
522
- public function cleanStyles (\ DOMElement $ e ): void
523
+ public function cleanStyles (JSLikeHTMLElement $ e ): void
523
524
{
524
525
if (\is_object ($ e )) {
525
526
$ elems = $ e ->getElementsByTagName ('* ' );
@@ -552,7 +553,7 @@ public function getWordCount(string $text): int
552
553
* This is the amount of text that is inside a link divided by the total text in the node.
553
554
* Can exclude external references to differentiate between simple text and menus/infoblocks.
554
555
*/
555
- public function getLinkDensity (\ DOMElement $ e , bool $ excludeExternal = false ): float
556
+ public function getLinkDensity (JSLikeHTMLElement $ e , bool $ excludeExternal = false ): float
556
557
{
557
558
$ links = $ e ->getElementsByTagName ('a ' );
558
559
$ textLength = mb_strlen ($ this ->getInnerText ($ e , true , true ));
@@ -575,7 +576,7 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f
575
576
/**
576
577
* Get an element relative weight.
577
578
*/
578
- public function getWeight (\ DOMElement $ e ): int
579
+ public function getWeight (JSLikeHTMLElement $ e ): int
579
580
{
580
581
if (!$ this ->flagIsActive (self ::FLAG_WEIGHT_ATTRIBUTES )) {
581
582
return 0 ;
@@ -606,7 +607,7 @@ public function killBreaks(JSLikeHTMLElement $node): void
606
607
*
607
608
* Updated 2012-09-18 to preserve youtube/vimeo iframes
608
609
*/
609
- public function clean (\ DOMElement $ e , string $ tag ): void
610
+ public function clean (JSLikeHTMLElement $ e , string $ tag ): void
610
611
{
611
612
$ targetList = $ e ->getElementsByTagName ($ tag );
612
613
$ isEmbed = ('audio ' === $ tag || 'video ' === $ tag || 'iframe ' === $ tag || 'object ' === $ tag || 'embed ' === $ tag );
@@ -638,7 +639,7 @@ public function clean(\DOMElement $e, string $tag): void
638
639
* "Fishy" is an algorithm based on content length, classnames,
639
640
* link density, number of images & embeds, etc.
640
641
*/
641
- public function cleanConditionally (\ DOMElement $ e , string $ tag ): void
642
+ public function cleanConditionally (JSLikeHTMLElement $ e , string $ tag ): void
642
643
{
643
644
if (!$ this ->flagIsActive (self ::FLAG_CLEAN_CONDITIONALLY )) {
644
645
return ;
@@ -751,7 +752,7 @@ public function cleanConditionally(\DOMElement $e, string $tag): void
751
752
/**
752
753
* Clean out spurious headers from an Element. Checks things like classnames and link density.
753
754
*/
754
- public function cleanHeaders (\ DOMElement $ e ): void
755
+ public function cleanHeaders (JSLikeHTMLElement $ e ): void
755
756
{
756
757
for ($ headerIndex = 1 ; $ headerIndex < 3 ; ++$ headerIndex ) {
757
758
$ headers = $ e ->getElementsByTagName ('h ' . $ headerIndex );
@@ -791,7 +792,7 @@ public function removeFlag(int $flag): void
791
792
/**
792
793
* Get the article title as an H1.
793
794
*/
794
- protected function getArticleTitle (): \ DOMElement
795
+ protected function getArticleTitle (): JSLikeHTMLElement
795
796
{
796
797
try {
797
798
$ curTitle = $ origTitle = $ this ->getInnerText ($ this ->dom ->getElementsByTagName ('title ' )->item (0 ));
@@ -861,7 +862,7 @@ protected function prepDocument(): void
861
862
* Initialize a node with the readability object. Also checks the
862
863
* className/id for special names to add to its score.
863
864
*/
864
- protected function initializeNode (\ DOMElement $ node ): void
865
+ protected function initializeNode (JSLikeHTMLElement $ node ): void
865
866
{
866
867
if (!isset ($ node ->tagName )) {
867
868
return ;
@@ -929,10 +930,8 @@ protected function initializeNode(\DOMElement $node): void
929
930
/**
930
931
* Using a variety of metrics (content score, classname, element types), find the content that is
931
932
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
932
- *
933
- * @return \DOMElement|false
934
933
*/
935
- protected function grabArticle (?\ DOMElement $ page = null )
934
+ protected function grabArticle (?JSLikeHTMLElement $ page = null ): ? JSLikeHTMLElement
936
935
{
937
936
if (!$ page ) {
938
937
$ page = $ this ->dom ;
@@ -1078,7 +1077,7 @@ protected function grabArticle(?\DOMElement $page = null)
1078
1077
1079
1078
foreach ($ ancestors as $ level => $ ancestor ) {
1080
1079
if (!$ ancestor ->nodeName || !$ ancestor ->parentNode ) {
1081
- return false ;
1080
+ return null ;
1082
1081
}
1083
1082
1084
1083
if (!$ ancestor ->hasAttribute ('readability ' )) {
@@ -1103,13 +1102,13 @@ protected function grabArticle(?\DOMElement $page = null)
1103
1102
* This is faster to do before scoring but safer after.
1104
1103
*/
1105
1104
if ($ this ->flagIsActive (self ::FLAG_STRIP_UNLIKELYS ) && $ xpath ) {
1106
- /** @var \DOMNodeList<\DOMElement > */
1105
+ /** @var \DOMNodeList<JSLikeHTMLElement > */
1107
1106
$ candidates = $ xpath ->query ('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)] ' , $ page ->documentElement );
1108
1107
1109
1108
for ($ c = $ candidates ->length - 1 ; $ c >= 0 ; --$ c ) {
1110
1109
$ node = $ candidates ->item ($ c );
1111
1110
// node should be readable but not inside of an article otherwise it's probably non-readable block
1112
- if ($ node ->hasAttribute ('readability ' ) && (int ) $ node ->getAttributeNode ('readability ' )->value < 40 && ($ node ->parentNode instanceof \DOMElement ? 0 !== strcasecmp ($ node ->parentNode ->tagName , 'article ' ) : true )) {
1111
+ if ($ node ->hasAttribute ('readability ' ) && (int ) $ node ->getAttributeNode ('readability ' )->value < 40 && ($ node ->parentNode instanceof JSLikeHTMLElement ? 0 !== strcasecmp ($ node ->parentNode ->tagName , 'article ' ) : true )) {
1113
1112
$ this ->logger ->debug ('Removing unlikely candidate (using note) ' . $ node ->getNodePath () . ' by " ' . $ node ->tagName . '" with readability ' . self ::getContentScore ($ node ));
1114
1113
$ node ->parentNode ->removeChild ($ node );
1115
1114
}
@@ -1130,7 +1129,7 @@ protected function grabArticle(?\DOMElement $page = null)
1130
1129
$ topCandidates = array_fill (0 , 5 , null );
1131
1130
if ($ xpath ) {
1132
1131
// Using array of DOMElements after deletion is a path to DOOMElement.
1133
- /** @var \DOMNodeList<\DOMElement > */
1132
+ /** @var \DOMNodeList<JSLikeHTMLElement > */
1134
1133
$ candidates = $ xpath ->query ('.//*[@data-candidate] ' , $ page ->documentElement );
1135
1134
$ this ->logger ->debug ('Candidates: ' . $ candidates ->length );
1136
1135
@@ -1157,7 +1156,7 @@ protected function grabArticle(?\DOMElement $page = null)
1157
1156
}
1158
1157
}
1159
1158
1160
- /** @var \DOMNodeList<\DOMElement > */
1159
+ /** @var \DOMNodeList<JSLikeHTMLElement > */
1161
1160
$ topCandidates = array_filter (
1162
1161
$ topCandidates ,
1163
1162
fn ($ v , $ idx ) => 0 === $ idx || null !== $ v ,
@@ -1250,7 +1249,7 @@ protected function grabArticle(?\DOMElement $page = null)
1250
1249
if (0 === strcasecmp ($ tagName , 'td ' ) || 0 === strcasecmp ($ tagName , 'tr ' )) {
1251
1250
$ up = $ topCandidate ;
1252
1251
1253
- if ($ up ->parentNode instanceof \DOMElement ) {
1252
+ if ($ up ->parentNode instanceof JSLikeHTMLElement ) {
1254
1253
$ up = $ up ->parentNode ;
1255
1254
1256
1255
if (0 === strcasecmp ($ up ->tagName , 'table ' )) {
@@ -1280,19 +1279,19 @@ protected function grabArticle(?\DOMElement $page = null)
1280
1279
$ siblingNode = $ siblingNodes ->item ($ s );
1281
1280
$ siblingNodeName = $ siblingNode ->nodeName ;
1282
1281
$ append = false ;
1283
- $ this ->logger ->debug ('Looking at sibling node: ' . $ siblingNode ->getNodePath () . (($ siblingNode instanceof \DOMElement && $ siblingNode ->hasAttribute ('readability ' )) ? (' with score ' . $ siblingNode ->getAttribute ('readability ' )) : '' ));
1282
+ $ this ->logger ->debug ('Looking at sibling node: ' . $ siblingNode ->getNodePath () . (($ siblingNode instanceof JSLikeHTMLElement && $ siblingNode ->hasAttribute ('readability ' )) ? (' with score ' . $ siblingNode ->getAttribute ('readability ' )) : '' ));
1284
1283
1285
1284
if ($ siblingNode ->isSameNode ($ topCandidate )) {
1286
1285
$ append = true ;
1287
1286
} else {
1288
1287
$ contentBonus = 0 ;
1289
1288
1290
1289
// Give a bonus if sibling nodes and top candidates have the same classname.
1291
- if ($ siblingNode instanceof \DOMElement && $ siblingNode ->getAttribute ('class ' ) === $ topCandidate ->getAttribute ('class ' ) && '' !== $ topCandidate ->getAttribute ('class ' )) {
1290
+ if ($ siblingNode instanceof JSLikeHTMLElement && $ siblingNode ->getAttribute ('class ' ) === $ topCandidate ->getAttribute ('class ' ) && '' !== $ topCandidate ->getAttribute ('class ' )) {
1292
1291
$ contentBonus += ((int ) $ topCandidate ->getAttribute ('readability ' )) * 0.2 ;
1293
1292
}
1294
1293
1295
- if ($ siblingNode instanceof \DOMElement && $ siblingNode ->hasAttribute ('readability ' ) && (((int ) $ siblingNode ->getAttribute ('readability ' )) + $ contentBonus ) >= $ siblingScoreThreshold ) {
1294
+ if ($ siblingNode instanceof JSLikeHTMLElement && $ siblingNode ->hasAttribute ('readability ' ) && (((int ) $ siblingNode ->getAttribute ('readability ' )) + $ contentBonus ) >= $ siblingScoreThreshold ) {
1296
1295
$ append = true ;
1297
1296
} elseif (0 === strcasecmp ($ siblingNodeName , 'p ' )) {
1298
1297
$ linkDensity = (int ) $ this ->getLinkDensity ($ siblingNode );
@@ -1369,7 +1368,7 @@ protected function grabArticle(?\DOMElement $page = null)
1369
1368
return $ this ->grabArticle ($ this ->body );
1370
1369
}
1371
1370
1372
- return false ;
1371
+ return null ;
1373
1372
}
1374
1373
1375
1374
return $ articleContent ;
@@ -1379,7 +1378,7 @@ protected function grabArticle(?\DOMElement $page = null)
1379
1378
* Get an element weight by attribute.
1380
1379
* Uses regular expressions to tell if this element looks good or bad.
1381
1380
*/
1382
- protected function weightAttribute (\ DOMElement $ element , string $ attribute ): int
1381
+ protected function weightAttribute (JSLikeHTMLElement $ element , string $ attribute ): int
1383
1382
{
1384
1383
if (!$ element ->hasAttribute ($ attribute )) {
1385
1384
return 0 ;
@@ -1423,7 +1422,7 @@ protected function reinitBody(): void
1423
1422
*
1424
1423
* @param callable(float): float $f
1425
1424
*/
1426
- private static function updateContentScore (\ DOMElement $ element , callable $ f ): void
1425
+ private static function updateContentScore (JSLikeHTMLElement $ element , callable $ f ): void
1427
1426
{
1428
1427
$ readabilityAttr = $ element ->getAttributeNode ('readability ' );
1429
1428
$ prevScore = (float ) $ readabilityAttr ->value ;
@@ -1433,7 +1432,7 @@ private static function updateContentScore(\DOMElement $element, callable $f): v
1433
1432
/**
1434
1433
* Gets the content score for given element.
1435
1434
*/
1436
- private static function getContentScore (\ DOMElement $ element ): float
1435
+ private static function getContentScore (JSLikeHTMLElement $ element ): float
1437
1436
{
1438
1437
return $ element ->hasAttribute ('readability ' ) ? (float ) $ element ->getAttribute ('readability ' ) : 0 ;
1439
1438
}
@@ -1505,11 +1504,11 @@ private function loadHtml(): void
1505
1504
$ this ->dom ->registerNodeClass (\DOMElement::class, JSLikeHTMLElement::class);
1506
1505
}
1507
1506
1508
- private function getAncestors (\ DOMElement $ node , int $ maxDepth = 0 ): array
1507
+ private function getAncestors (JSLikeHTMLElement $ node , int $ maxDepth = 0 ): array
1509
1508
{
1510
1509
$ ancestors = [];
1511
1510
$ i = 0 ;
1512
- while ($ node ->parentNode instanceof \DOMElement ) {
1511
+ while ($ node ->parentNode instanceof JSLikeHTMLElement ) {
1513
1512
$ ancestors [] = $ node ->parentNode ;
1514
1513
if (++$ i === $ maxDepth ) {
1515
1514
break ;
@@ -1537,7 +1536,7 @@ private function isPhrasingContent($node): bool
1537
1536
);
1538
1537
}
1539
1538
1540
- private function hasSingleTagInsideElement (\ DOMElement $ node , string $ tag ): bool
1539
+ private function hasSingleTagInsideElement (JSLikeHTMLElement $ node , string $ tag ): bool
1541
1540
{
1542
1541
if (1 !== $ node ->childNodes ->length || $ node ->childNodes ->item (0 )->nodeName !== $ tag ) {
1543
1542
return false ;
@@ -1557,7 +1556,7 @@ private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
1557
1556
* Tidy must be configured to not clean the input for this function to
1558
1557
* work as expected, see $this->tidy_config['clean']
1559
1558
*/
1560
- private function isNodeVisible (\ DOMElement $ node ): bool
1559
+ private function isNodeVisible (JSLikeHTMLElement $ node ): bool
1561
1560
{
1562
1561
return !(
1563
1562
$ node ->hasAttribute ('style ' )
0 commit comments