2
2
3
3
namespace Readability ;
4
4
5
- use DOMElement ;
6
5
use Masterminds \HTML5 ;
7
6
use Psr \Log \LoggerAwareInterface ;
8
7
use Psr \Log \LoggerInterface ;
@@ -115,7 +114,7 @@ class Readability implements LoggerAwareInterface
115
114
// HACK: replace linebreaks plus br's with p's
116
115
'!(<br[^>]*>[ \r\n\s]*){2,}!i ' => '</p><p> ' ,
117
116
// replace noscripts
118
- //'!</?noscript>!is' => '',
117
+ // '!</?noscript>!is' => '',
119
118
// replace fonts to spans
120
119
'!<(/?)font[^>]*>!is ' => '< \\1span> ' ,
121
120
];
@@ -126,8 +125,8 @@ class Readability implements LoggerAwareInterface
126
125
// replace empty tags that break layouts
127
126
'!<(?:a|div|p|figure)[^>]+/>!is ' => '' ,
128
127
// remove all attributes on text tags
129
- //'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>",
130
- //single newlines cleanup
128
+ // '!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>",
129
+ // single newlines cleanup
131
130
"/ \n+/ " => "\n" ,
132
131
// modern web...
133
132
'!<pre[^>]*>\s*<code!is ' => '<pre ' ,
@@ -161,7 +160,7 @@ public function setLogger(LoggerInterface $logger): void
161
160
/**
162
161
* Get article title element.
163
162
*
164
- * @return DOMElement
163
+ * @return \ DOMElement
165
164
*/
166
165
public function getTitle ()
167
166
{
@@ -171,7 +170,7 @@ public function getTitle()
171
170
/**
172
171
* Get article content element.
173
172
*
174
- * @return DOMElement
173
+ * @return \ DOMElement
175
174
*/
176
175
public function getContent ()
177
176
{
@@ -280,7 +279,7 @@ public function init(): bool
280
279
/**
281
280
* Run any post-process modifications to article content as necessary.
282
281
*/
283
- public function postProcessContent (DOMElement $ articleContent ): void
282
+ public function postProcessContent (\ DOMElement $ articleContent ): void
284
283
{
285
284
if ($ this ->convertLinksToFootnotes && !preg_match ('/\bwiki/ ' , $ this ->url )) {
286
285
$ this ->addFootnotes ($ articleContent );
@@ -292,7 +291,7 @@ public function postProcessContent(DOMElement $articleContent): void
292
291
*
293
292
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
294
293
*/
295
- public function addFootnotes (DOMElement $ articleContent ): void
294
+ public function addFootnotes (\ DOMElement $ articleContent ): void
296
295
{
297
296
$ footnotesWrapper = $ this ->dom ->createElement ('footer ' );
298
297
$ footnotesWrapper ->setAttribute ('class ' , 'readability-footnotes ' );
@@ -335,7 +334,7 @@ public function addFootnotes(DOMElement $articleContent): void
335
334
$ articleLink ->setAttribute ('style ' , 'color: inherit; text-decoration: none; ' );
336
335
$ articleLink ->setAttribute ('name ' , 'readabilityLink- ' . $ linkCount );
337
336
$ footnote ->setInnerHtml ('<small><sup><a href="#readabilityLink- ' . $ linkCount . '" title="Jump to Link in Article">^</a></sup></small> ' );
338
- $ footnoteLink ->setInnerHtml (( '' !== $ footnoteLink ->getAttribute ('title ' ) ? $ footnoteLink ->getAttribute ('title ' ) : $ linkText) );
337
+ $ footnoteLink ->setInnerHtml ('' !== $ footnoteLink ->getAttribute ('title ' ) ? $ footnoteLink ->getAttribute ('title ' ) : $ linkText );
339
338
$ footnoteLink ->setAttribute ('name ' , 'readabilityFootnoteLink- ' . $ linkCount );
340
339
$ footnote ->appendChild ($ footnoteLink );
341
340
@@ -356,7 +355,7 @@ public function addFootnotes(DOMElement $articleContent): void
356
355
*/
357
356
public function prepArticle (\DOMNode $ articleContent ): void
358
357
{
359
- if (!$ articleContent instanceof DOMElement) {
358
+ if (!$ articleContent instanceof \ DOMElement) {
360
359
return ;
361
360
}
362
361
@@ -456,9 +455,9 @@ public function prepArticle(\DOMNode $articleContent): void
456
455
* Get the inner text of a node.
457
456
* This also strips out any excess whitespace to be found.
458
457
*
459
- * @param DOMElement $e
460
- * @param bool $normalizeSpaces (default: true)
461
- * @param bool $flattenLines (default: false)
458
+ * @param \ DOMElement $e
459
+ * @param bool $normalizeSpaces (default: true)
460
+ * @param bool $flattenLines (default: false)
462
461
*/
463
462
public function getInnerText ($ e , bool $ normalizeSpaces = true , bool $ flattenLines = false ): string
464
463
{
@@ -482,7 +481,7 @@ public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLine
482
481
/**
483
482
* Remove the style attribute on every $e and under.
484
483
*/
485
- public function cleanStyles (DOMElement $ e ): void
484
+ public function cleanStyles (\ DOMElement $ e ): void
486
485
{
487
486
if (\is_object ($ e )) {
488
487
$ elems = $ e ->getElementsByTagName ('* ' );
@@ -515,7 +514,7 @@ public function getWordCount(string $text): int
515
514
* This is the amount of text that is inside a link divided by the total text in the node.
516
515
* Can exclude external references to differentiate between simple text and menus/infoblocks.
517
516
*/
518
- public function getLinkDensity (DOMElement $ e , bool $ excludeExternal = false ): float
517
+ public function getLinkDensity (\ DOMElement $ e , bool $ excludeExternal = false ): float
519
518
{
520
519
$ links = $ e ->getElementsByTagName ('a ' );
521
520
$ textLength = mb_strlen ($ this ->getInnerText ($ e , true , true ));
@@ -538,7 +537,7 @@ public function getLinkDensity(DOMElement $e, bool $excludeExternal = false): fl
538
537
/**
539
538
* Get an element relative weight.
540
539
*/
541
- public function getWeight (DOMElement $ e ): int
540
+ public function getWeight (\ DOMElement $ e ): int
542
541
{
543
542
if (!$ this ->flagIsActive (self ::FLAG_WEIGHT_ATTRIBUTES )) {
544
543
return 0 ;
@@ -556,7 +555,7 @@ public function getWeight(DOMElement $e): int
556
555
/**
557
556
* Remove extraneous break tags from a node.
558
557
*/
559
- public function killBreaks (DOMElement $ node ): void
558
+ public function killBreaks (\ DOMElement $ node ): void
560
559
{
561
560
$ html = $ node ->getInnerHTML ();
562
561
$ html = preg_replace ($ this ->regexps ['killBreaks ' ], '<br /> ' , $ html );
@@ -569,7 +568,7 @@ public function killBreaks(DOMElement $node): void
569
568
*
570
569
* Updated 2012-09-18 to preserve youtube/vimeo iframes
571
570
*/
572
- public function clean (DOMElement $ e , string $ tag ): void
571
+ public function clean (\ DOMElement $ e , string $ tag ): void
573
572
{
574
573
$ targetList = $ e ->getElementsByTagName ($ tag );
575
574
$ isEmbed = ('audio ' === $ tag || 'video ' === $ tag || 'iframe ' === $ tag || 'object ' === $ tag || 'embed ' === $ tag );
@@ -601,7 +600,7 @@ public function clean(DOMElement $e, string $tag): void
601
600
* "Fishy" is an algorithm based on content length, classnames,
602
601
* link density, number of images & embeds, etc.
603
602
*/
604
- public function cleanConditionally (DOMElement $ e , string $ tag ): void
603
+ public function cleanConditionally (\ DOMElement $ e , string $ tag ): void
605
604
{
606
605
if (!$ this ->flagIsActive (self ::FLAG_CLEAN_CONDITIONALLY )) {
607
606
return ;
@@ -714,7 +713,7 @@ public function cleanConditionally(DOMElement $e, string $tag): void
714
713
/**
715
714
* Clean out spurious headers from an Element. Checks things like classnames and link density.
716
715
*/
717
- public function cleanHeaders (DOMElement $ e ): void
716
+ public function cleanHeaders (\ DOMElement $ e ): void
718
717
{
719
718
for ($ headerIndex = 1 ; $ headerIndex < 3 ; ++$ headerIndex ) {
720
719
$ headers = $ e ->getElementsByTagName ('h ' . $ headerIndex );
@@ -754,7 +753,7 @@ public function removeFlag(int $flag): void
754
753
/**
755
754
* Get the article title as an H1.
756
755
*
757
- * @return DOMElement
756
+ * @return \ DOMElement
758
757
*/
759
758
protected function getArticleTitle ()
760
759
{
@@ -826,7 +825,7 @@ protected function prepDocument(): void
826
825
* Initialize a node with the readability object. Also checks the
827
826
* className/id for special names to add to its score.
828
827
*/
829
- protected function initializeNode (DOMElement $ node ): void
828
+ protected function initializeNode (\ DOMElement $ node ): void
830
829
{
831
830
if (!isset ($ node ->tagName )) {
832
831
return ;
@@ -894,11 +893,11 @@ protected function initializeNode(DOMElement $node): void
894
893
* Using a variety of metrics (content score, classname, element types), find the content that is
895
894
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
896
895
*
897
- * @param DOMElement $page
896
+ * @param \ DOMElement $page
898
897
*
899
- * @return DOMElement|false
898
+ * @return \ DOMElement|false
900
899
*/
901
- protected function grabArticle (DOMElement $ page = null )
900
+ protected function grabArticle (\ DOMElement $ page = null )
902
901
{
903
902
if (!$ page ) {
904
903
$ page = $ this ->dom ;
@@ -1040,7 +1039,7 @@ protected function grabArticle(DOMElement $page = null)
1040
1039
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
1041
1040
$ contentScore += min (floor (mb_strlen ($ innerText ) / self ::SCORE_CHARS_IN_PARAGRAPH ), 3 );
1042
1041
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
1043
- //$contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
1042
+ // $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
1044
1043
1045
1044
foreach ($ ancestors as $ level => $ ancestor ) {
1046
1045
if (!$ ancestor ->nodeName || !$ ancestor ->parentNode ) {
@@ -1211,7 +1210,7 @@ protected function grabArticle(DOMElement $page = null)
1211
1210
if (0 === strcasecmp ($ tagName , 'td ' ) || 0 === strcasecmp ($ tagName , 'tr ' )) {
1212
1211
$ up = $ topCandidate ;
1213
1212
1214
- if ($ up ->parentNode instanceof DOMElement) {
1213
+ if ($ up ->parentNode instanceof \ DOMElement) {
1215
1214
$ up = $ up ->parentNode ;
1216
1215
1217
1216
if (0 === strcasecmp ($ up ->tagName , 'table ' )) {
@@ -1292,8 +1291,8 @@ protected function grabArticle(DOMElement $page = null)
1292
1291
1293
1292
// To ensure a node does not interfere with readability styles, remove its classnames & ids.
1294
1293
// Now done via RegExp post_filter.
1295
- //$nodeToAppend->removeAttribute('class');
1296
- //$nodeToAppend->removeAttribute('id');
1294
+ // $nodeToAppend->removeAttribute('class');
1295
+ // $nodeToAppend->removeAttribute('id');
1297
1296
// Append sibling and subtract from our list as appending removes a node.
1298
1297
$ articleContent ->appendChild ($ nodeToAppend );
1299
1298
}
@@ -1340,7 +1339,7 @@ protected function grabArticle(DOMElement $page = null)
1340
1339
* Get an element weight by attribute.
1341
1340
* Uses regular expressions to tell if this element looks good or bad.
1342
1341
*/
1343
- protected function weightAttribute (DOMElement $ element , string $ attribute ): int
1342
+ protected function weightAttribute (\ DOMElement $ element , string $ attribute ): int
1344
1343
{
1345
1344
if (!$ element ->hasAttribute ($ attribute )) {
1346
1345
return 0 ;
@@ -1427,7 +1426,7 @@ private function loadHtml(): void
1427
1426
unset($ tidy );
1428
1427
}
1429
1428
1430
- $ this ->html = mb_convert_encoding (( string ) $ this ->html , ' HTML-ENTITIES ' , ' UTF-8 ' ) ;
1429
+ $ this ->html = ' <meta charset="utf-8"> ' . ( string ) $ this ->html ;
1431
1430
1432
1431
if ('html5lib ' === $ this ->parser || 'html5 ' === $ this ->parser ) {
1433
1432
$ this ->dom = (new HTML5 ())->loadHTML ($ this ->html );
@@ -1443,14 +1442,14 @@ private function loadHtml(): void
1443
1442
libxml_use_internal_errors (false );
1444
1443
}
1445
1444
1446
- $ this ->dom ->registerNodeClass (DOMElement::class, \Readability \JSLikeHTMLElement::class);
1445
+ $ this ->dom ->registerNodeClass (\ DOMElement::class, \Readability \JSLikeHTMLElement::class);
1447
1446
}
1448
1447
1449
- private function getAncestors (DOMElement $ node , int $ maxDepth = 0 ): array
1448
+ private function getAncestors (\ DOMElement $ node , int $ maxDepth = 0 ): array
1450
1449
{
1451
1450
$ ancestors = [];
1452
1451
$ i = 0 ;
1453
- while ($ node ->parentNode instanceof DOMElement) {
1452
+ while ($ node ->parentNode instanceof \ DOMElement) {
1454
1453
$ ancestors [] = $ node ->parentNode ;
1455
1454
if (++$ i === $ maxDepth ) {
1456
1455
break ;
@@ -1470,7 +1469,7 @@ private function isPhrasingContent($node): bool
1470
1469
}, iterator_to_array ($ node ->childNodes )), true ));
1471
1470
}
1472
1471
1473
- private function hasSingleTagInsideElement (DOMElement $ node , string $ tag ): bool
1472
+ private function hasSingleTagInsideElement (\ DOMElement $ node , string $ tag ): bool
1474
1473
{
1475
1474
if (1 !== $ node ->childNodes ->length || $ node ->childNodes ->item (0 )->nodeName !== $ tag ) {
1476
1475
return false ;
@@ -1490,11 +1489,11 @@ private function hasSingleTagInsideElement(DOMElement $node, string $tag): bool
1490
1489
* Tidy must be configured to not clean the input for this function to
1491
1490
* work as expected, see $this->tidy_config['clean']
1492
1491
*/
1493
- private function isNodeVisible (DOMElement $ node ): bool
1492
+ private function isNodeVisible (\ DOMElement $ node ): bool
1494
1493
{
1495
1494
return !($ node ->hasAttribute ('style ' )
1496
1495
&& preg_match ($ this ->regexps ['isNotVisible ' ], $ node ->getAttribute ('style ' ))
1497
- )
1496
+ )
1498
1497
&& !$ node ->hasAttribute ('hidden ' );
1499
1498
}
1500
1499
}
0 commit comments