Automated build for v0.01
This commit is contained in:
374
vendor/andreskrey/Readability/Configuration.php
vendored
Normal file
374
vendor/andreskrey/Readability/Configuration.php
vendored
Normal file
@ -0,0 +1,374 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability;
|
||||
|
||||
use Psr\Log\LoggerAwareTrait;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use Psr\Log\NullLogger;
|
||||
|
||||
/**
 * Class Configuration.
 *
 * Container for the library's option values plus an optional PSR-3 logger.
 * Options can be passed in bulk to the constructor or changed one at a time
 * through the fluent setters.
 */
class Configuration
{
    use LoggerAwareTrait;

    /**
     * Default: 5.
     *
     * @var int
     */
    protected $maxTopCandidates = 5;

    /**
     * Default: 500.
     *
     * @var int
     */
    protected $charThreshold = 500;

    /**
     * Default: false.
     *
     * @var bool
     */
    protected $articleByLine = false;

    /**
     * Default: true.
     *
     * @var bool
     */
    protected $stripUnlikelyCandidates = true;

    /**
     * Default: true.
     *
     * @var bool
     */
    protected $cleanConditionally = true;

    /**
     * Default: true.
     *
     * @var bool
     */
    protected $weightClasses = true;

    /**
     * Default: false.
     *
     * @var bool
     */
    protected $fixRelativeURLs = false;

    /**
     * Default: false.
     *
     * @var bool
     */
    protected $substituteEntities = false;

    /**
     * Default: false.
     *
     * @var bool
     */
    protected $normalizeEntities = false;

    /**
     * Default: false.
     *
     * @var bool
     */
    protected $summonCthulhu = false;

    /**
     * Default: 'http://fakehost'.
     *
     * @var string
     */
    protected $originalURL = 'http://fakehost';

    /**
     * Configuration constructor.
     *
     * Each key of $params is turned into a setter name ("set" . $key) and, when
     * such a method exists on this object, it is called with the matching value.
     * Keys without a matching setter are silently ignored.
     *
     * @param array $params Option name => value pairs
     */
    public function __construct(array $params = [])
    {
        foreach ($params as $key => $value) {
            $setter = sprintf('set%s', $key);
            if (method_exists($this, $setter)) {
                $this->{$setter}($value);
            }
        }
    }

    /**
     * Returns an array-representation of configuration.
     *
     * Every property whose current value is not an object and that has a
     * matching "get<Property>" method is exported through that getter.
     *
     * NOTE(review): the logger property comes from LoggerAwareTrait; presumably,
     * while it is still null, it passes the !is_object() check and a 'logger'
     * entry (the NullLogger returned by getLogger()) is exported - confirm
     * whether that is intended.
     *
     * @return array
     */
    public function toArray()
    {
        $out = [];
        foreach ($this as $key => $value) {
            $getter = sprintf('get%s', $key);
            if (!is_object($value) && method_exists($this, $getter)) {
                $out[$key] = $this->{$getter}();
            }
        }

        return $out;
    }

    /**
     * Returns the configured logger, or a fresh NullLogger when none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        // If no logger has been set, just return a null logger
        return $this->logger === null ? new NullLogger() : $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     *
     * @return Configuration
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * @return int
     */
    public function getMaxTopCandidates()
    {
        return $this->maxTopCandidates;
    }

    /**
     * @param int $maxTopCandidates
     *
     * @return $this
     */
    public function setMaxTopCandidates($maxTopCandidates)
    {
        $this->maxTopCandidates = $maxTopCandidates;

        return $this;
    }

    /**
     * @return int
     */
    public function getCharThreshold()
    {
        return $this->charThreshold;
    }

    /**
     * @param int $charThreshold
     *
     * @return $this
     */
    public function setCharThreshold($charThreshold)
    {
        $this->charThreshold = $charThreshold;

        return $this;
    }

    /**
     * Deprecated alias of getCharThreshold().
     *
     * Raises a (silenced) E_USER_DEPRECATED notice and then delegates to the
     * replacement getter.
     *
     * @deprecated Use getCharThreshold. Will be removed in version 3.0 (as announced by the runtime notice)
     *
     * @return int
     */
    public function getWordThreshold()
    {
        @trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);

        return $this->getCharThreshold();
    }

    /**
     * Deprecated alias of setCharThreshold().
     *
     * Raises a (silenced) E_USER_DEPRECATED notice and then delegates to the
     * replacement setter.
     *
     * @deprecated Use setCharThreshold. Will be removed in version 3.0 (as announced by the runtime notice)
     *
     * @param int $charThreshold
     *
     * @return $this
     */
    public function setWordThreshold($charThreshold)
    {
        @trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);

        return $this->setCharThreshold($charThreshold);
    }

    /**
     * @return bool
     */
    public function getArticleByLine()
    {
        return $this->articleByLine;
    }

    /**
     * @param bool $articleByLine
     *
     * @return $this
     */
    public function setArticleByLine($articleByLine)
    {
        $this->articleByLine = $articleByLine;

        return $this;
    }

    /**
     * @return bool
     */
    public function getStripUnlikelyCandidates()
    {
        return $this->stripUnlikelyCandidates;
    }

    /**
     * @param bool $stripUnlikelyCandidates
     *
     * @return $this
     */
    public function setStripUnlikelyCandidates($stripUnlikelyCandidates)
    {
        $this->stripUnlikelyCandidates = $stripUnlikelyCandidates;

        return $this;
    }

    /**
     * @return bool
     */
    public function getCleanConditionally()
    {
        return $this->cleanConditionally;
    }

    /**
     * @param bool $cleanConditionally
     *
     * @return $this
     */
    public function setCleanConditionally($cleanConditionally)
    {
        $this->cleanConditionally = $cleanConditionally;

        return $this;
    }

    /**
     * @return bool
     */
    public function getWeightClasses()
    {
        return $this->weightClasses;
    }

    /**
     * @param bool $weightClasses
     *
     * @return $this
     */
    public function setWeightClasses($weightClasses)
    {
        $this->weightClasses = $weightClasses;

        return $this;
    }

    /**
     * @return bool
     */
    public function getFixRelativeURLs()
    {
        return $this->fixRelativeURLs;
    }

    /**
     * @param bool $fixRelativeURLs
     *
     * @return $this
     */
    public function setFixRelativeURLs($fixRelativeURLs)
    {
        $this->fixRelativeURLs = $fixRelativeURLs;

        return $this;
    }

    /**
     * @return bool
     */
    public function getSubstituteEntities()
    {
        return $this->substituteEntities;
    }

    /**
     * @param bool $substituteEntities
     *
     * @return $this
     */
    public function setSubstituteEntities($substituteEntities)
    {
        $this->substituteEntities = $substituteEntities;

        return $this;
    }

    /**
     * @return bool
     */
    public function getNormalizeEntities()
    {
        return $this->normalizeEntities;
    }

    /**
     * @param bool $normalizeEntities
     *
     * @return $this
     */
    public function setNormalizeEntities($normalizeEntities)
    {
        $this->normalizeEntities = $normalizeEntities;

        return $this;
    }

    /**
     * @return string
     */
    public function getOriginalURL()
    {
        return $this->originalURL;
    }

    /**
     * @param string $originalURL
     *
     * @return $this
     */
    public function setOriginalURL($originalURL)
    {
        $this->originalURL = $originalURL;

        return $this;
    }

    /**
     * @return bool
     */
    public function getSummonCthulhu()
    {
        return $this->summonCthulhu;
    }

    /**
     * @param bool $summonCthulhu
     *
     * @return $this
     */
    public function setSummonCthulhu($summonCthulhu)
    {
        $this->summonCthulhu = $summonCthulhu;

        return $this;
    }
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMAttr.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMAttr.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMAttr that mixes in the NodeTrait helpers.
 */
class DOMAttr extends \DOMAttr
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMCdataSection.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMCdataSection.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMCdataSection that mixes in the NodeTrait helpers.
 */
class DOMCdataSection extends \DOMCdataSection
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMCharacterData.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMCharacterData.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMCharacterData that mixes in the NodeTrait helpers.
 */
class DOMCharacterData extends \DOMCharacterData
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMComment.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMComment.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMComment that mixes in the NodeTrait helpers.
 */
class DOMComment extends \DOMComment
{
    use NodeTrait;
}
|
30
vendor/andreskrey/Readability/Nodes/DOM/DOMDocument.php
vendored
Normal file
30
vendor/andreskrey/Readability/Nodes/DOM/DOMDocument.php
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMDocument that mixes in the NodeTrait helpers
 * and makes the document hand out the extended node classes of this namespace
 * instead of the native ones.
 */
class DOMDocument extends \DOMDocument
{
    use NodeTrait;

    /**
     * Creates the document and registers every extended node class.
     *
     * Both arguments are forwarded unchanged to \DOMDocument::__construct().
     * They are optional and default to the same values as the native
     * constructor, so the class can be instantiated exactly like its parent.
     *
     * @param string $version  Version number of the document, part of the XML declaration
     * @param string $encoding Encoding of the document, part of the XML declaration
     */
    public function __construct($version = '1.0', $encoding = '')
    {
        parent::__construct($version, $encoding);

        // Native base class => extended replacement, registered in this order.
        $nodeClasses = [
            'DOMAttr' => DOMAttr::class,
            'DOMCdataSection' => DOMCdataSection::class,
            'DOMCharacterData' => DOMCharacterData::class,
            'DOMComment' => DOMComment::class,
            'DOMDocument' => self::class,
            'DOMDocumentFragment' => DOMDocumentFragment::class,
            'DOMDocumentType' => DOMDocumentType::class,
            'DOMElement' => DOMElement::class,
            'DOMEntity' => DOMEntity::class,
            'DOMEntityReference' => DOMEntityReference::class,
            'DOMNode' => DOMNode::class,
            'DOMNotation' => DOMNotation::class,
            'DOMProcessingInstruction' => DOMProcessingInstruction::class,
            'DOMText' => DOMText::class,
        ];

        foreach ($nodeClasses as $baseClass => $extendedClass) {
            $this->registerNodeClass($baseClass, $extendedClass);
        }
    }
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMDocumentFragment.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMDocumentFragment.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMDocumentFragment that mixes in the NodeTrait helpers.
 */
class DOMDocumentFragment extends \DOMDocumentFragment
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMDocumentType.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMDocumentType.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMDocumentType that mixes in the NodeTrait helpers.
 */
class DOMDocumentType extends \DOMDocumentType
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMElement.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMElement.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMElement that mixes in the NodeTrait helpers.
 */
class DOMElement extends \DOMElement
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMEntity.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMEntity.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMEntity that mixes in the NodeTrait helpers.
 */
class DOMEntity extends \DOMEntity
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMEntityReference.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMEntityReference.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMEntityReference that mixes in the NodeTrait helpers.
 */
class DOMEntityReference extends \DOMEntityReference
{
    use NodeTrait;
}
|
14
vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php
vendored
Normal file
14
vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMNode that mixes in the NodeTrait helpers.
 *
 * The attribute accessors below are declared for static analysis; NodeTrait
 * supplies implementations that guard against nodes without attributes.
 *
 * @method getAttribute($attribute)
 * @method hasAttribute($attribute)
 */
class DOMNode extends \DOMNode
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMNotation.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMNotation.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMNotation that mixes in the NodeTrait helpers.
 */
class DOMNotation extends \DOMNotation
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMProcessingInstruction.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMProcessingInstruction.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMProcessingInstruction that mixes in the NodeTrait helpers.
 */
class DOMProcessingInstruction extends \DOMProcessingInstruction
{
    use NodeTrait;
}
|
10
vendor/andreskrey/Readability/Nodes/DOM/DOMText.php
vendored
Normal file
10
vendor/andreskrey/Readability/Nodes/DOM/DOMText.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes\DOM;
|
||||
|
||||
use andreskrey\Readability\Nodes\NodeTrait;
|
||||
|
||||
/**
 * Subclass of PHP's native \DOMText that mixes in the NodeTrait helpers.
 */
class DOMText extends \DOMText
{
    use NodeTrait;
}
|
560
vendor/andreskrey/Readability/Nodes/NodeTrait.php
vendored
Normal file
560
vendor/andreskrey/Readability/Nodes/NodeTrait.php
vendored
Normal file
@ -0,0 +1,560 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes;
|
||||
|
||||
use andreskrey\Readability\Nodes\DOM\DOMDocument;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMElement;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMNode;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMText;
|
||||
use DOMNodeList;
|
||||
|
||||
/**
 * Helper methods shared by every extended DOM node class (scoring, text and
 * attribute access, ancestor/child inspection and table/visibility checks).
 *
 * @method \DOMNode removeAttribute($name)
 */
trait NodeTrait
{
    /**
     * Content score of the node. Used to determine the value of the content.
     *
     * @var int
     */
    public $contentScore = 0;

    /**
     * Flag for initialized status.
     *
     * @var bool
     */
    private $initialized = false;

    /**
     * Flag data tables.
     *
     * Currently unused: the flag is persisted as a DOM attribute instead
     * (see isReadabilityDataTable()).
     *
     * @var bool
     */
    private $readabilityDataTable = false;

    /**
     * Tag names that hasSingleChildBlockElement() treats as block elements.
     *
     * @var array
     */
    private $divToPElements = [
        'a',
        'blockquote',
        'dl',
        'div',
        'img',
        'ol',
        'p',
        'pre',
        'table',
        'ul',
        'select',
    ];

    /**
     * The commented out elements qualify as phrasing content but tend to be
     * removed by readability when put into paragraphs, so we ignore them here.
     *
     * @var array
     */
    private $phrasing_elems = [
        // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO',
        'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data',
        'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label',
        'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q',
        'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub',
        'sup', 'textarea', 'time', 'var', 'wbr'
    ];

    /**
     * initialized getter.
     *
     * @return bool
     */
    public function isInitialized()
    {
        return $this->initialized;
    }

    /**
     * Tells whether the node was flagged as a data table.
     *
     * @return bool
     */
    public function isReadabilityDataTable()
    {
        /*
         * This is a workaround that I'd like to remove in the future.
         * Seems that although we are extending the base DOMElement and adding custom properties (like
         * 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName.
         * This means that even if we mark the tables in a previous step, when we want to retrieve that information,
         * all the custom properties are in their default values. Somehow we need to find a way to make these properties
         * permanent across the whole DOM. Until then the flag is stored as an attribute on the node itself
         * instead of in $this->readabilityDataTable.
         *
         * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
         */
        return $this->hasAttribute('readabilityDataTable')
            && $this->getAttribute('readabilityDataTable') === '1';
    }

    /**
     * Flags (or un-flags) the node as a data table by writing the
     * 'readabilityDataTable' attribute.
     *
     * @param bool $param
     */
    public function setReadabilityDataTable($param)
    {
        // Can't be "true" because DOMDocument casts it to "1"
        $this->setAttribute('readabilityDataTable', $param ? '1' : '0');
    }

    /**
     * Initializer. Calculates the base content score of the node from its tag name
     * (plus the class/id weight when requested) and marks the node as initialized.
     * Calling it again on an initialized node changes nothing.
     *
     * @ TODO: I don't like the weightClasses param. How can we get the config here?
     *
     * @param $weightClasses bool Weight classes?
     *
     * @return static
     */
    public function initializeNode($weightClasses)
    {
        if (!$this->isInitialized()) {
            $contentScore = 0;

            switch ($this->nodeName) {
                case 'div':
                    $contentScore += 5;
                    break;

                case 'pre':
                case 'td':
                case 'blockquote':
                    $contentScore += 3;
                    break;

                case 'address':
                case 'ol':
                case 'ul':
                case 'dl':
                case 'dd':
                case 'dt':
                case 'li':
                case 'form':
                    $contentScore -= 3;
                    break;

                case 'h1':
                case 'h2':
                case 'h3':
                case 'h4':
                case 'h5':
                case 'h6':
                case 'th':
                    $contentScore -= 5;
                    break;
            }

            $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0);

            $this->initialized = true;
        }

        return $this;
    }

    /**
     * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need
     * to check first the existence of the attributes property.
     *
     * @param $attributeName string Attribute to retrieve
     *
     * @return string Attribute value, or '' when the node carries no attributes
     */
    #[\ReturnTypeWillChange]
    public function getAttribute($attributeName)
    {
        if (!is_null($this->attributes)) {
            return parent::getAttribute($attributeName);
        }

        return '';
    }

    /**
     * Override for native hasAttribute.
     *
     * @see getAttribute
     *
     * @param $attributeName
     *
     * @return bool False when the node carries no attributes at all
     */
    #[\ReturnTypeWillChange]
    public function hasAttribute($attributeName)
    {
        if (!is_null($this->attributes)) {
            return parent::hasAttribute($attributeName);
        }

        return false;
    }

    /**
     * Get the ancestors of the current node, nearest first. The walk stops at the
     * document node, which is never included.
     *
     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
     *
     * @return array
     */
    public function getNodeAncestors($maxLevel = 3)
    {
        $ancestors = [];
        $level = 0;

        $node = $this->parentNode;

        while ($node && !($node instanceof DOMDocument)) {
            $ancestors[] = $node;
            $level++;
            // Strict comparison: with $maxLevel === false this never matches, so every ancestor is collected.
            if ($level === $maxLevel) {
                break;
            }
            $node = $node->parentNode;
        }

        return $ancestors;
    }

    /**
     * Returns all links from the current element.
     *
     * @return array
     */
    public function getAllLinks()
    {
        return iterator_to_array($this->getElementsByTagName('a'));
    }

    /**
     * Get the density of links as a fraction of the content.
     * This is the amount of text that is inside a link divided by the total text in the node.
     *
     * @return int|float 0 when the node has no text, otherwise link text length / total text length
     */
    public function getLinkDensity()
    {
        $textLength = mb_strlen($this->getTextContent(true));

        if (!$textLength) {
            return 0;
        }

        $linkLength = 0;

        /** @var DOMElement $link */
        foreach ($this->getAllLinks() as $link) {
            $linkLength += mb_strlen($link->getTextContent(true));
        }

        return $linkLength / $textLength;
    }

    /**
     * Calculates the weight of the class/id of the current element: -25 for each
     * attribute matching the 'negative' pattern and +25 for each one matching the
     * 'positive' pattern.
     *
     * @return int
     */
    public function getClassWeight()
    {
        $weight = 0;

        // The class name and the id are scored by exactly the same rules.
        foreach (['class', 'id'] as $attributeName) {
            $value = $this->getAttribute($attributeName);
            if (!trim($value)) {
                continue;
            }

            if (preg_match(NodeUtility::$regexps['negative'], $value)) {
                $weight -= 25;
            }

            if (preg_match(NodeUtility::$regexps['positive'], $value)) {
                $weight += 25;
            }
        }

        return $weight;
    }

    /**
     * Returns the full text of the node.
     *
     * @param bool $normalize Normalize white space? (runs of 2+ whitespace characters become one space, result is trimmed)
     *
     * @return string
     */
    public function getTextContent($normalize = false)
    {
        $nodeValue = $this->nodeValue;
        if ($normalize) {
            $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue));
        }

        return $nodeValue;
    }

    /**
     * Returns the children of the current node.
     *
     * @param bool $filterEmptyDOMText Filter empty DOMText nodes?
     *
     * @return array
     */
    public function getChildren($filterEmptyDOMText = false)
    {
        $ret = iterator_to_array($this->childNodes);
        if ($filterEmptyDOMText) {
            // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number
            $ret = array_values(array_filter($ret, function ($node) {
                return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue));
            }));
        }

        return $ret;
    }

    /**
     * Return an array indicating how many rows and columns this table has.
     *
     * A numeric, positive rowspan/colspan attribute contributes its value; a
     * missing, empty, zero or non-numeric one counts as 1.
     *
     * @return array ['rows' => int, 'columns' => int]
     */
    public function getRowAndColumnCount()
    {
        $rows = $columns = 0;

        foreach ($this->getElementsByTagName('tr') as $tr) {
            /** @var \DOMElement $tr */
            // "$rowspan || 1" is a boolean in PHP (unlike JS), so the span has to be parsed explicitly.
            $rowspan = (int) $tr->getAttribute('rowspan');
            $rows += $rowspan > 0 ? $rowspan : 1;

            // Now look for column-related info
            $columnsInThisRow = 0;
            foreach ($tr->getElementsByTagName('td') as $cell) {
                /** @var \DOMElement $cell */
                $colspan = (int) $cell->getAttribute('colspan');
                $columnsInThisRow += $colspan > 0 ? $colspan : 1;
            }
            $columns = max($columns, $columnsInThisRow);
        }

        return ['rows' => $rows, 'columns' => $columns];
    }

    /**
     * Creates a new node based on the text content of the original node.
     *
     * The text is attached as a text node rather than passed as the value
     * argument of createElement(), because that argument is not escaped and a
     * bare "&" would be treated as the start of an entity reference.
     *
     * @param $originalNode DOMNode
     * @param $tagName string
     *
     * @return DOMElement
     */
    public function createNode($originalNode, $tagName)
    {
        $document = $originalNode->ownerDocument;
        $text = (string) $originalNode->getTextContent();

        $newNode = $document->createElement($tagName);
        if ($text !== '') {
            $newNode->appendChild($document->createTextNode($text));
        }

        return $newNode;
    }

    /**
     * Check if a given node has one of its ancestor tag name matching the
     * provided one.
     *
     * @param string $tagName
     * @param int $maxDepth Values of 0 or less disable the depth limit
     * @param callable $filterFn Optional extra predicate the matching ancestor must satisfy
     *
     * @return bool
     */
    public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
    {
        $depth = 0;
        $node = $this;

        while ($node->parentNode) {
            if ($maxDepth > 0 && $depth > $maxDepth) {
                return false;
            }

            if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) {
                return true;
            }

            $node = $node->parentNode;
            $depth++;
        }

        return false;
    }

    /**
     * Check if this node has only whitespace and a single element with given tag
     * or if it contains no element with given tag or more than 1 element.
     *
     * @param $tag string Name of tag
     *
     * @return bool
     */
    public function hasSingleTagInsideElement($tag)
    {
        // There should be exactly 1 child with given tag (whitespace-only text nodes are ignored)
        $children = $this->getChildren(true);
        if (count($children) !== 1 || $children[0]->nodeName !== $tag) {
            return false;
        }

        // Same check the former reducer applied to that single child.
        $onlyChild = $children[0];

        /* @var DOMNode $onlyChild */
        return !($onlyChild->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $onlyChild->getTextContent()));
    }

    /**
     * Check whether any descendant of the current element is a block element.
     * Block elements are the ones defined in the divToPElements array.
     *
     * @return bool
     */
    public function hasSingleChildBlockElement()
    {
        $result = false;
        if ($this->hasChildNodes()) {
            foreach ($this->getChildren() as $child) {
                if (in_array($child->nodeName, $this->divToPElements, true)) {
                    $result = true;
                } else {
                    // If any of the hasSingleChildBlockElement calls return true, return true then.
                    /** @var $child DOMElement */
                    $result = ($result || $child->hasSingleChildBlockElement());
                }
            }
        }

        return $result;
    }

    /**
     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
     *
     * @return bool
     */
    public function isElementWithoutContent()
    {
        return $this instanceof DOMElement &&
            mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 &&
            ($this->childNodes->length === 0 ||
                $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length
                /*
                 * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node.
                 * If there's an empty tag with an space inside and a BR (for example "<p> <br/></p>) counting only BRs and
                 * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument,
                 * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
                 * are dealing with (And at this point we know they are empty or are just whitespace, because of the
                 * mb_strlen in this chain of checks).
                 */
                + count(array_filter(iterator_to_array($this->childNodes), function ($child) {
                    return $child instanceof DOMText;
                }))
            );
    }

    /**
     * Determine if a node qualifies as phrasing content.
     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
     *
     * Text nodes and the tags listed in $phrasing_elems qualify directly; 'a',
     * 'del' and 'ins' qualify when every one of their children does.
     *
     * @return bool
     */
    public function isPhrasingContent()
    {
        return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems, true) ||
            (!is_null($this->childNodes) &&
                ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') &&
                array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) {
                    return $node->isPhrasingContent() && $carry;
                }, true)
            );
    }

    /**
     * Tells whether the node is likely to be rendered.
     *
     * @return bool False when the inline style declares "display: none" or the 'hidden' attribute is present
     */
    public function isProbablyVisible()
    {
        /*
         * In the original JS project they check if the node has the style display=none, which unfortunately
         * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
         * The pattern ignores case and tolerates any whitespace around the colon.
         *
         * Might be a good idea to check for classes or other attributes like 'aria-hidden'
         */

        return !preg_match('/display\s*:\s*none/i', $this->getAttribute('style')) && !$this->hasAttribute('hidden');
    }

    /**
     * Tells whether the node is a whitespace-only text node or a 'br' element.
     *
     * @return bool
     */
    public function isWhitespace()
    {
        return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
            ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
    }

    /**
     * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
     *
     * In the JS version of getElementsByTagName, if you remove a node it will not appear during the
     * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
     * orphan node and will give an exception if you try to do anything with it.
     *
     * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are
     * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that
     * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries
     * to access node 6)
     *
     * This function solves this by searching for the nodes on every loop and keeping track of the count differences.
     * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be
     * used only when the results of the search are going to be used to remove the nodes.
     *
     * @param string $tag
     *
     * @return \Generator
     */
    public function shiftingAwareGetElementsByTagName($tag)
    {
        /** @var $nodes DOMNodeList */
        $nodes = $this->getElementsByTagName($tag);
        $count = $nodes->length;

        // max() keeps the index from going negative after the adjustment below.
        for ($i = 0; $i < $count; $i = max(++$i, 0)) {
            yield $nodes->item($i);

            // Search for all the nodes again
            $nodes = $this->getElementsByTagName($tag);

            // Amount of nodes that disappeared while the consumer handled the yielded node
            $removed = $count - $nodes->length;

            // Subtract the amount of nodes removed from the current index
            $i -= $removed;

            // Subtract the amount of nodes removed from the current count
            $count -= $removed;
        }
    }
}
|
160
vendor/andreskrey/Readability/Nodes/NodeUtility.php
vendored
Normal file
160
vendor/andreskrey/Readability/Nodes/NodeUtility.php
vendored
Normal file
@ -0,0 +1,160 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability\Nodes;
|
||||
|
||||
use andreskrey\Readability\Nodes\DOM\DOMDocument;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMElement;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMNode;
|
||||
|
||||
/**
|
||||
* Class NodeUtility.
|
||||
*/
|
||||
class NodeUtility
{
    /**
     * Collection of regexps to check the node usability.
     *
     * Every entry is a complete PCRE pattern (delimiters and modifiers included) that can be handed
     * directly to the preg_* functions.
     *
     * @var array
     */
    public static $regexps = [
        'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
        'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
        'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
        'byline' => '/byline|author|dateline|writtenby|p-author/i',
        // PCRE in PHP has no "g" modifier (preg_replace already replaces every match); using it makes
        // the pattern fail to compile with "Unknown modifier 'g'", so only "i" is kept.
        'replaceFonts' => '/<(\/?)font[^>]*>/i',
        'normalize' => '/\s{2,}/',
        'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
        'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
        'prevLink' => '/(prev|earl|old|new|<|«)/i',
        'whitespace' => '/^\s*$/',
        'hasContent' => '/\S$/',
        'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
        'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
        // \x{00A0} is the Unicode code point of the non-breaking space (&nbsp;)
        'onlyWhitespace' => '/\x{00A0}|\s+/u',
    ];

    /**
     * Imported from the Element class on league\html-to-markdown.
     *
     * Starting at $node, walks forward through the siblings while the current node is a non-element
     * node that reports itself as whitespace, and returns the node where the walk stops.
     *
     * @param DOMNode|null $node Node to start from (it is itself a candidate for the result)
     *
     * @return DOMNode|null The first node that is an element or is not whitespace, or null when the
     *                      siblings run out
     */
    public static function nextElement($node)
    {
        $next = $node;
        while ($next
            && $next->nodeType !== XML_ELEMENT_NODE
            && $next->isWhitespace()) {
            $next = $next->nextSibling;
        }

        return $next;
    }

    /**
     * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
     * element with the new tag name and importing it to the main DOMDocument.
     *
     * The children (and optionally the attributes) of $node are deep-copied into the new element. When $node
     * is attached to a parent it is replaced in place; a detached node has nothing to be replaced in, so the
     * new element is simply returned.
     *
     * @param DOMNode $node
     * @param string $value New tag name
     * @param bool $importAttributes Whether the attributes of $node are copied to the new element
     *
     * @return DOMNode The new element, owned by the document of $node
     */
    public static function setNodeTag($node, $value, $importAttributes = true)
    {
        $new = new DOMDocument('1.0', 'utf-8');
        $new->appendChild($new->createElement($value));

        // Deep-copy every child into the scratch document, under the renamed element.
        foreach ($node->childNodes as $child) {
            $new->firstChild->appendChild($new->importNode($child, true));
        }

        // hasAttributes() is false for nodes without an attribute map, which keeps the loop away from null.
        if ($importAttributes && $node->hasAttributes()) {
            // Import attributes from the original node.
            foreach ($node->attributes as $attribute) {
                $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
            }
        }

        // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
        $import = $node->ownerDocument->importNode($new->firstChild, true);

        // A detached node has no parent to swap it in; calling replaceChild() on null would be a fatal error.
        if ($node->parentNode) {
            $node->parentNode->replaceChild($import, $node);
        }

        return $import;
    }

    /**
     * Removes the current node and returns the next node to be parsed (sibling, or the sibling of the
     * closest ancestor that has one). The children of the removed node are skipped.
     *
     * @param DOMNode $node
     *
     * @return DOMNode|null Null when there is nothing left after $node
     */
    public static function removeAndGetNext($node)
    {
        // The successor has to be resolved before the node is detached and loses its sibling/parent links.
        $nextNode = self::getNextNode($node, true);

        // Delegates to removeNode() so an already detached node does not trigger a call on a null parent.
        self::removeNode($node);

        return $nextNode;
    }

    /**
     * Remove the selected node. Does nothing when the node has no parent.
     *
     * @param $node DOMElement
     *
     * @return void
     **/
    public static function removeNode($node)
    {
        $parent = $node->parentNode;
        if ($parent) {
            $parent->removeChild($node);
        }
    }

    /**
     * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
     * for parents.
     *
     * @param DOMNode $originalNode
     * @param bool $ignoreSelfAndKids
     *
     * @return DOMNode|null Null once the traversal has run past the last node
     */
    public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
    {
        /*
         * Traverse the DOM from node to node, starting at the node passed in.
         * Pass true for the second parameter to indicate this node itself
         * (and its kids) are going away, and we want the next node over.
         *
         * Calling this in a loop will traverse the DOM depth-first.
         */

        // First check for kids if those aren't being ignored
        if (!$ignoreSelfAndKids && $originalNode->firstChild) {
            return $originalNode->firstChild;
        }

        // Then for siblings...
        if ($originalNode->nextSibling) {
            return $originalNode->nextSibling;
        }

        // And finally, move up the parent chain *and* find a sibling
        // (because this is depth-first traversal, we will have already
        // seen the parent nodes themselves).
        do {
            $originalNode = $originalNode->parentNode;
        } while ($originalNode && !$originalNode->nextSibling);

        return ($originalNode) ? $originalNode->nextSibling : $originalNode;
    }
}
|
7
vendor/andreskrey/Readability/ParseException.php
vendored
Normal file
7
vendor/andreskrey/Readability/ParseException.php
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
<?php
|
||||
|
||||
namespace andreskrey\Readability;
|
||||
|
||||
/**
 * Class ParseException.
 *
 * Package-specific exception type. It adds no members of its own; everything is inherited from \Exception.
 */
class ParseException extends \Exception
{
}
|
1774
vendor/andreskrey/Readability/Readability.php
vendored
Normal file
1774
vendor/andreskrey/Readability/Readability.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user