Skip to content

Commit 1bc8514

Browse files
authored
Merge pull request #47 from Euak/ngrams
Ngrams improvements
2 parents a446020 + 453f3c9 commit 1bc8514

File tree

6 files changed

+754
-13
lines changed

6 files changed

+754
-13
lines changed

src/NGrams/NGramFactory.php

Lines changed: 96 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@ class NGramFactory
1111
{
1212
const BIGRAM = 2;
1313
const TRIGRAM = 3;
14-
14+
1515
/**
16-
* Protect the constructor
17-
*/
16+
* Protect the constructor
17+
*/
1818
// Non-public and intentionally empty: code outside the class hierarchy cannot instantiate the factory.
protected function __construct(){}
19-
19+
2020
/**
21-
* Generate Ngrams from the tokens
22-
* @param array $tokens
23-
* @param int $nGramSize
24-
* @return array return an array of the ngrams
25-
*/
21+
* Generate Ngrams from the tokens
22+
* @param array $tokens
23+
* @param int $nGramSize
24+
* @return array return an array of the ngrams
25+
*/
2626
static public function create(array $tokens, $nGramSize = self::BIGRAM, $separator = ' ') : array
2727
{
2828
$separatorLength = strlen($separator);
@@ -31,17 +31,100 @@ static public function create(array $tokens, $nGramSize = self::BIGRAM, $separat
3131
return [];
3232
}
3333
$ngrams = array_fill(0, $length, ''); // initialize the array
34-
34+
3535
for($index = 0; $index < $length; $index++)
3636
{
3737
for($jindex = 0; $jindex < $nGramSize; $jindex++)
3838
{
39-
$ngrams[$index] .= $tokens[$index + $jindex];
40-
if($jindex < $nGramSize - $separatorLength) {
39+
$ngrams[$index] .= $tokens[$index + $jindex];
40+
// Condition changed: the separator is appended after every token except the last one, so the test depends on the token's position within the n-gram, not on the separator's length.
41+
if($jindex < $nGramSize - 1) {
4142
$ngrams[$index] .= $separator;
4243
}
4344
}
4445
}
45-
return $ngrams;
46+
return $ngrams;
47+
}
48+
49+
/**
 * Compute the frequency of every distinct ngram together with the
 * positional frequencies of its tokens.
 *
 * Each entry of the result is keyed by the ngram string and holds:
 *  - [0]       how many times the ngram occurs in $ngrams;
 *  - [1..n]    for each of its n tokens, how many ngrams of $ngrams hold that
 *              token at the same position;
 *  - [n+1...]  only for ngrams of more than two tokens: for each position pair
 *              (0,1), (0,2), ..., (n-2,n-1), in that order, how many ngrams
 *              hold both tokens at those positions.
 *
 * @param array $ngrams ngram strings whose tokens are joined by $sep
 * @param string $sep separator that joins the tokens inside each ngram
 * @return array return an array of the ngrams with frequencies
 */
static public function getFreq(array $ngrams, string $sep = ' ') : array
{
    // Number of occurrences of each distinct ngram string.
    $ngramCounts = array_count_values($ngrams);

    // Every ngram (duplicates included) split into its tokens; the positional
    // counts below are taken over this full list.
    $ngramsArray = self::ngramsAsArray($sep, $ngrams);

    $ngramsFinal = [];

    foreach ($ngramCounts as $ngramString => $ngramFrequency) {
        // array_count_values() turns purely numeric ngrams into integer keys,
        // so cast back to string before splitting.
        $ngramArray = explode($sep, (string) $ngramString);
        $ngramSize = count($ngramArray);

        // Slot 0 always carries the frequency of the ngram itself.
        $frequencies = [$ngramFrequency];

        // Pair counts are stored in consecutive slots right after the n
        // single-token counts. A running counter is used because a key derived
        // from the two positions ($ngramSize + $kToken + $i) is not unique for
        // ngrams of four or more tokens, e.g. pairs (0,3) and (1,2) collide.
        // For trigrams the counter yields the same keys 4, 5 and 6.
        $pairSlot = $ngramSize + 1;

        foreach ($ngramArray as $kToken => $token) {
            // Frequency of this token at this position.
            $frequencies[$kToken + 1] = self::countFreq($ngramsArray, $token, $kToken);

            if ($ngramSize > 2) {
                // Combined frequency of this token with each later token.
                for ($i = $kToken + 1; $i < $ngramSize; $i++) {
                    $frequencies[$pairSlot++] = self::countFreq($ngramsArray, $token, $kToken, $ngramArray[$i], $i);
                }
            }
        }

        $ngramsFinal[$ngramString] = $frequencies;
    }

    return $ngramsFinal;
}
87+
88+
/**
 * Count the ngrams that hold the given token at the given position and,
 * when a second token/position pair is supplied, also hold that second
 * token at its position.
 *
 * An ngram that has no token at a requested position never matches.
 *
 * @param array $ngramsArray ngrams, each one given as an array of its tokens
 * @param string $str1 token that must be present
 * @param int $pos1 zero-based position where $str1 must be found
 * @param string $str2 optional second token that must also be present
 * @param int $pos2 zero-based position where $str2 must be found
 * @return int $count return the frequency
 */
static private function countFreq(array $ngramsArray, string $str1, int $pos1, string $str2 = null, int $pos2 = null) : int
{
    // The second constraint applies only when both its token and its position
    // were supplied; this does not change between iterations.
    $matchPair = $str2 !== null && $pos2 !== null;

    $count = 0;

    foreach ($ngramsArray as $ngramArray) {
        // "?? null" lets an ngram that is too short fail the comparison
        // instead of raising an undefined-offset notice/warning.
        if (($ngramArray[$pos1] ?? null) !== $str1) {
            continue;
        }

        if ($matchPair && ($ngramArray[$pos2] ?? null) !== $str2) {
            continue;
        }

        $count++;
    }

    return $count;
}
116+
117+
/**
 * Split every ngram string into the list of its tokens.
 *
 * The result is re-indexed from 0 in the iteration order of $ngrams,
 * whatever keys $ngrams uses.
 *
 * @param string $sep separator that joins the tokens inside each ngram
 * @param array $ngrams ngram strings
 * @return array $ngramsArray one array of tokens per ngram
 */
static private function ngramsAsArray(string $sep, array $ngrams) : array {
    $splitNgram = static function ($ngram) use ($sep) {
        return explode($sep, $ngram);
    };

    // array_map() keeps the keys of a single input array; array_values()
    // renumbers them so the result is a plain 0-based list.
    return array_values(array_map($splitNgram, $ngrams));
}
47130
}

0 commit comments

Comments
 (0)