Skip to content

Commit a0332ed

Browse files
authored
Merge pull request #49 from Euak/ngrams-fix
Get Ngrams frequencies fix
2 parents 1bc8514 + 78f012c commit a0332ed

File tree

1 file changed

+44
-22
lines changed

1 file changed

+44
-22
lines changed

src/NGrams/NGramFactory.php

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -63,19 +63,23 @@ static public function getFreq(array $ngrams, string $sep = ' ') : array
6363
//creates an array of tokens per ngram
6464
$ngramsArray = self::ngramsAsArray($sep, $ngrams);
6565

66+
$ngramSize = count($ngramsArray[0]);
67+
68+
$tokens_frequencies = self::readFreq($sep, $ngrams);
69+
$combo_frequencies = self::readCombFreq($sep, $ngrams);
70+
6671
//iterate over the array with no repeated ngrams
6772
foreach ($ngramsUnique as $ngramString => $ngramFrequency) {
6873
$ngramsFinal[$ngramString] = array($ngramFrequency); //putting into the final array an array of frequencies (first, the ngram frequency)
6974

7075
$ngramArray = explode($sep, $ngramString); //getting an array of tokens of the ngram
71-
$ngramSize = count($ngramArray); //getting the size of ngram
7276
foreach ($ngramArray as $kToken => $token) { //iterating the array of tokens of the ngram
73-
$ngramsFinal[$ngramString][$kToken+1] = self::countFreq($ngramsArray, $token, $kToken); //getting the frequency of the token
77+
$ngramsFinal[$ngramString][$kToken+1] = $tokens_frequencies[$token][$kToken]; //getting the frequency of the token
7478

7579
if($ngramSize > 2) {
7680
//getting the combined frequency of the tokens
7781
for ($i = $kToken+1; $i < $ngramSize; $i++) {
78-
$ngramsFinal[$ngramString][$ngramSize+$kToken+$i] = self::countFreq($ngramsArray, $token, $kToken, $ngramArray[$i], $i);
82+
$ngramsFinal[$ngramString][$ngramSize+$kToken+$i] = $combo_frequencies[$token.$sep.$ngramArray[$i]][$kToken][$i];
7983
}
8084
}
8185
}
@@ -86,32 +90,50 @@ static public function getFreq(array $ngrams, string $sep = ' ') : array
8690
}
8791

8892
/**
89-
* Counts the number of times the given string(s) occur at the given position(s) in the given ngrams array.
90-
* @param array $ngramsArray
91-
* @param string $str1
92-
* @param int $pos1
93-
* @param string $str2
94-
* @param int $pos2
95-
* @return int $count return the frequency
93+
* Counts the frequency of each token of an ngram array
94+
* @param string $sep
95+
* @param array $ngrams
96+
* @return array $frequencies Returns an array mapping each token to its frequency at each position
9697
*/
97-
static private function countFreq(array $ngramsArray, string $str1, int $pos1, string $str2 = null, int $pos2 = null) : int
98+
static public function readFreq(string $sep, array $ngrams) : array
9899
{
99-
$count = 0;
100-
101-
//counts the number of times the given string(s) occur at the given position(s) in the given ngrams array.
102-
foreach ($ngramsArray as $ngramArray) {
103-
if($str1 === $ngramArray[$pos1]) {
104-
if(isset($str2) && isset($pos2)) {
105-
if($str2 === $ngramArray[$pos2]) {
106-
$count++;
107-
}
100+
$ngrams = self::ngramsAsArray($sep, $ngrams);
101+
$frequencies = array();
102+
foreach ($ngrams as $ngram) {
103+
foreach ($ngram as $pos => $token) {
104+
if(isset($frequencies[$token][$pos])) { //checks if the token in that position was already counted
105+
$frequencies[$token][$pos] += 1;
108106
} else {
109-
$count++;
107+
$frequencies[$token][$pos] = 1;
110108
}
111109
}
112110
}
111+
return $frequencies;
112+
}
113+
114+
/**
115+
* Counts the frequency of each combination of two tokens in an ngram array
116+
* @param string $sep
117+
* @param array $ngrams
118+
* @return array $frequencies Returns an array mapping each token combination to its frequency at each pair of positions
119+
*/
120+
static public function readCombFreq(string $sep, array $ngrams) : array
121+
{
122+
$ngrams = self::ngramsAsArray($sep, $ngrams);
123+
$frequencies = array();
124+
foreach ($ngrams as $ngram) {
125+
foreach ($ngram as $posToken => $token) {
126+
for ($i = $posToken+1; $i < count($ngram); $i++) {
127+
if(isset($frequencies[$token.$sep.$ngram[$i]][$posToken][$i])) { //checks if the combo already exists
128+
$frequencies[$token.$sep.$ngram[$i]][$posToken][$i] += 1;
129+
} else {
130+
$frequencies[$token.$sep.$ngram[$i]][$posToken][$i] = 1;
131+
}
132+
}
133+
}
113134

114-
return $count;
135+
}
136+
return $frequencies;
115137
}
116138

117139
/**

0 commit comments

Comments
 (0)