Skip to content

Commit d033e77

Browse files
authored
Merge pull request #22 from yooper/add_text_corpus
Add text corpus, lexical diversity algorithms and helper methods
2 parents eff075c + f2b4453 commit d033e77

File tree

17 files changed

+409
-11
lines changed

17 files changed

+409
-11
lines changed

composer.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
"psr-4": {
1414
"TextAnalysis\\": "src/"
1515
},
16-
"files": ["src/helpers/storage.php", "src/helpers/print.php", "src/helpers/simplified.php"]
16+
"files": ["src/helpers/storage.php", "src/helpers/print.php", "src/helpers/simplified.php", "src/helpers/helpers.php"]
1717
},
1818
"autoload-dev": {
1919
"files": ["tests/TestBaseCase.php"]
@@ -24,7 +24,7 @@
2424
}
2525
},
2626
"require" : {
27-
"php": ">=5.5",
27+
"php": ">=7.0",
2828
"yooper/stop-words": "^1.0",
2929
"symfony/console": ">=2.7",
3030
"camspiers/porter-stemmer": "1.0.*",
@@ -33,6 +33,6 @@
3333
},
3434
"require-dev": {
3535
"phpunit/phpunit": "5.*",
36-
"mockery/mockery" : "0.9.7"
36+
"mockery/mockery" : "1.0"
3737
}
3838
}

src/Collocations/CollocationFinder.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ public function __construct(array $tokens, $nGramSize = 2)
2727
*/
2828
public function getCollocations()
2929
{
30-
$nGramTokens = NGramFactory::create($this->tokens, $this->nGramSize);
31-
return (new FreqDist($nGramTokens))->getKeyValuesByFrequency();
30+
$nGramTokens = ngrams($this->tokens, $this->nGramSize);
31+
return freq_dist($nGramTokens)->getKeyValuesByFrequency();
3232
}
3333

3434
/**
@@ -37,8 +37,8 @@ public function getCollocations()
3737
*/
3838
public function getCollocationsByPmi()
3939
{
40-
$nGramFreqDist = new FreqDist(NGramFactory::create($this->tokens, $this->nGramSize));
41-
$unigramsFreqDist = new FreqDist($this->tokens);
40+
$nGramFreqDist = freq_dist(ngrams($this->tokens, $this->nGramSize));
41+
$unigramsFreqDist = freq_dist($this->tokens);
4242

4343
$dataSet = [];
4444
foreach($nGramFreqDist->getKeys() as $nGramToken)

src/Comparisons/LongestCommonSubstringComparison.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
class LongestCommonSubstringComparison implements ISimilarity, IDistance
1414
{
1515
/**
16-
* Using caching to improve preformance on text2 inputs
16+
* Using caching to improve performance on text2 inputs
1717
* @var boolean
1818
*/
1919
protected $useCache = false;

src/Corpus/TextCorpus.php

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
<?php
2+
3+
namespace TextAnalysis\Corpus;
4+
5+
use TextAnalysis\Tokenizers\GeneralTokenizer;
6+
use TextAnalysis\LexicalDiversity\Naive;
7+
8+
/**
9+
* Explore the text corpus
10+
* @author yooper
11+
*/
12+
class TextCorpus
{
    /**
     * The raw text wrapped by this corpus.
     *
     * @var string
     */
    protected $text;

    /**
     * Token cache, populated lazily by getTokens().
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * @param string $text The text to explore
     */
    public function __construct(string $text)
    {
        $this->text = $text;
    }

    /**
     * Returns the original text
     * @return string
     */
    public function getText() : string
    {
        return $this->text;
    }

    /**
     * Tokenize the text, caching the result for later calls.
     *
     * NOTE: the cache is not keyed by tokenizer; once a non-empty token list
     * has been cached, it is returned regardless of $tokenizerClassName.
     *
     * @param string $tokenizerClassName Class name handed to the tokenize() helper
     * @return array
     */
    public function getTokens(string $tokenizerClassName = GeneralTokenizer::class) : array
    {
        if(empty($this->tokens)) {
            $this->tokens = tokenize($this->getText(), $tokenizerClassName);
        }
        return $this->tokens;
    }

    /**
     * Return, for every needle, the list of positions at which it was found in the text
     * @param array $needles The strings to look for
     * @return array Map of needle => int[] of byte offsets (see findAll())
     */
    public function getDispersion(array $needles) : array
    {
        $found = array_fill_keys($needles, []);
        // Iterate over the needle values; iterating over array_keys() would
        // search for the numeric indexes of the list instead of the needles.
        foreach($needles as $needle)
        {
            $found[$needle] = $this->findAll($needle);
        }
        return $found;
    }

    /**
     * Compute the lexical diversity, the default uses a naive algorithm
     * @param string $lexicalDiversityClassName
     * @return float
     */
    public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::class) : float
    {
        return lexical_diversity($this->getTokens(), $lexicalDiversityClassName);
    }

    /**
     * Return every case-insensitive occurrence of the needle together with
     * some surrounding context. Runs of whitespace in the text are collapsed
     * to a single space before searching.
     *
     * See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
     * @param string $needle
     * @param int $spacing The amount of space left and right of the found needle
     * @return array List of text excerpts, one per match; empty for an empty needle
     */
    public function concordance(string $needle, int $spacing = 20) : array
    {
        $found = [];
        $needleLength = strlen($needle);
        // An empty needle matches at every offset without advancing the
        // cursor, which would loop forever on PHP 8.
        if($needleLength === 0) {
            return $found;
        }

        $text = trim(preg_replace('/\s+/', ' ', $this->text));
        $textLength = strlen($text);
        $bufferLength = $needleLength + 2 * $spacing;
        $position = 0;

        while (($position = stripos($text, $needle, $position)) !== false)
        {
            $left = max($position - $spacing, 0);
            if($needleLength + $spacing + $position > $textLength) {
                // The right-hand context runs past the end of the text
                $found[] = substr($text, $left);
            } else {
                $found[] = substr($text, $left, $bufferLength);
            }
            // Continue searching after the current match
            $position += $needleLength;
        }
        return $found;
    }

    /**
     * Get the fraction of tokens in the text that are equal to the needle
     * @param string $needle
     * @return float 0.0 when the needle is not a token or the corpus has no tokens
     */
    public function percentage(string $needle) : float
    {
        $freqDist = freq_dist($this->getTokens());
        $total = $freqDist->getTotalTokens();
        // Avoid a division by zero on an empty corpus
        if($total == 0) {
            return 0.0;
        }
        $frequencies = $freqDist->getKeyValuesByFrequency();
        // A needle that never occurs has a frequency of zero
        return ($frequencies[$needle] ?? 0) / $total;
    }

    /**
     * Performs a case insensitive search for the needle
     * @param string $needle
     * @return int Number of non-overlapping occurrences; 0 for an empty needle
     */
    public function count(string $needle) : int
    {
        // substr_count() rejects an empty needle
        if($needle === '') {
            return 0;
        }
        return substr_count(strtolower($this->getText()), strtolower($needle));
    }

    /**
     * Return all the positions of the needle found in the text (case insensitive)
     * @param string $needle
     * @return array List of byte offsets of non-overlapping matches; empty for an empty needle
     */
    public function findAll(string $needle) : array
    {
        $positions = [];
        $needleLength = strlen($needle);
        // An empty needle would never advance the cursor below
        if($needleLength === 0) {
            return $positions;
        }

        // Both sides are lower-cased up front, so a case-sensitive search suffices
        $needle = strtolower($needle);
        $text = strtolower($this->getText());
        $lastPos = 0;
        while (($lastPos = strpos($text, $needle, $lastPos)) !== false)
        {
            $positions[] = $lastPos;
            $lastPos += $needleLength;
        }
        return $positions;
    }

    /**
     * Returns the original text
     * @return string
     */
    public function toString()
    {
        return $this->text;
    }

    public function __destruct()
    {
        unset($this->text);
        unset($this->tokens);
    }
}

src/Indexes/TfIdf.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public function __construct(ICollection $collection)
3737
protected function buildIndex(ICollection $collection)
3838
{
3939
foreach($collection as $id => $document){
40-
$freqDist = new FreqDist($document->getDocumentData());
40+
$freqDist = freq_dist($document->getDocumentData());
4141
foreach($freqDist->getKeyValuesByFrequency() as $key => $freq) {
4242
if(!isset($this->idf[$key])) {
4343
$this->idf[$key] = 0;

src/Interfaces/ILexicalDiversity.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<?php
2+
3+
namespace TextAnalysis\Interfaces;
4+
5+
/**
6+
* Lexical Diversity Interface
7+
* @author developer
8+
*/
9+
interface ILexicalDiversity
{
    /**
     * Compute a lexical diversity score for the given tokens.
     *
     * @param array $tokens
     * @return float
     */
    public function getDiversity(array $tokens) : float;
}

src/LexicalDiversity/Naive.php

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace TextAnalysis\LexicalDiversity;
4+
5+
/**
6+
* A very simple lexical diversity measure: the number of unique tokens divided by the total character length of all tokens.
7+
*
8+
* @author yooper
9+
*/
10+
class Naive implements \TextAnalysis\Interfaces\ILexicalDiversity
{
    /**
     * Divide the number of distinct tokens by the summed string length of
     * all tokens.
     *
     * @param array $tokens
     * @return float 0.0 when there are no tokens or every token is an empty string
     */
    public function getDiversity(array $tokens) : float
    {
        $totalLength = array_sum(array_map('strlen', $tokens));
        // Guard against a division by zero for empty input
        if ($totalLength === 0) {
            return 0.0;
        }
        return count(array_unique($tokens)) / $totalLength;
    }
}

src/LexicalDiversity/YuleI.php

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<?php
2+
3+
namespace TextAnalysis\LexicalDiversity;
4+
5+
/**
6+
* Implementation of Yule's I algorithm
7+
* @author yooper
8+
*/
9+
class YuleI
{
    /**
     * Compute Yule's I as (M1 * M1) / (M2 - M1), where M1 is the total number
     * of tokens and M2 is the sum of the squared per-token frequencies.
     *
     * @param array $tokens Tokens to measure (strings or integers, as required by array_count_values())
     * @return float 0.0 for an empty token list; INF when every token occurs exactly once
     */
    public function getDiversity(array $tokens) : float
    {
        $freq = array_count_values($tokens);
        $m1 = array_sum($freq);
        if ($m1 === 0) {
            // Nothing to measure
            return 0.0;
        }

        $m2 = array_sum(array_map(function ($count) { return $count ** 2; }, $freq));
        if ($m2 === $m1) {
            // No token repeats, so the denominator is zero: return the
            // limit value explicitly instead of dividing by zero.
            return INF;
        }
        return ($m1 * $m1) / ($m2 - $m1);
    }
}

src/LexicalDiversity/YuleK.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php
2+
3+
namespace TextAnalysis\LexicalDiversity;
4+
5+
/**
6+
* Implementation of Yule's K, computed as 10000 times the reciprocal of Yule's I
7+
*
8+
* @author yooper
9+
*/
10+
class YuleK extends YuleI implements \TextAnalysis\Interfaces\ILexicalDiversity
{
    /**
     * Compute Yule's K as 10000 times the reciprocal of Yule's I.
     *
     * @param array $tokens
     * @return float 0.0 for an empty token list
     */
    public function getDiversity(array $tokens): float
    {
        // With no tokens there is no Yule's I to invert; avoid dividing by zero
        if (empty($tokens)) {
            return 0.0;
        }
        return 1 / parent::getDiversity($tokens) * 10000;
    }
}

src/NGrams/NGramFactory.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ protected function __construct(){}
2323
* @param int $nGramSize
2424
* @return array return an array of the ngrams
2525
*/
26-
static public function create(array $tokens, $nGramSize = self::BIGRAM, $separator = ' ')
26+
static public function create(array $tokens, $nGramSize = self::BIGRAM, $separator = ' ') : array
2727
{
2828
$separatorLength = strlen($separator);
2929
$length = count($tokens) - $nGramSize + 1;

0 commit comments

Comments
 (0)