@@ -11,18 +11,18 @@ class NGramFactory
11
11
{
12
12
const BIGRAM = 2 ;
13
13
const TRIGRAM = 3 ;
14
-
14
+
15
15
/**
16
- * Protect the constructor
17
- */
16
+ * Protect the constructor
17
+ */
18
18
protected function __construct (){}
19
-
19
+
20
20
/**
21
- * Generate Ngrams from the tokens
22
- * @param array $tokens
23
- * @param int $nGramSize
24
- * @return array return an array of the ngrams
25
- */
21
+ * Generate Ngrams from the tokens
22
+ * @param array $tokens
23
+ * @param int $nGramSize
24
+ * @return array return an array of the ngrams
25
+ */
26
26
static public function create (array $ tokens , $ nGramSize = self ::BIGRAM , $ separator = ' ' ) : array
27
27
{
28
28
$ separatorLength = strlen ($ separator );
@@ -31,17 +31,100 @@ static public function create(array $tokens, $nGramSize = self::BIGRAM, $separat
31
31
return [];
32
32
}
33
33
$ ngrams = array_fill (0 , $ length , '' ); // initialize the array
34
-
34
+
35
35
for ($ index = 0 ; $ index < $ length ; $ index ++)
36
36
{
37
37
for ($ jindex = 0 ; $ jindex < $ nGramSize ; $ jindex ++)
38
38
{
39
- $ ngrams [$ index ] .= $ tokens [$ index + $ jindex ];
40
- if ($ jindex < $ nGramSize - $ separatorLength ) {
39
+ $ ngrams [$ index ] .= $ tokens [$ index + $ jindex ];
40
+ // Condition changed: the separator's length is irrelevant here; what matters is the position of the inner index relative to the n-gram size.
41
+ if ($ jindex < $ nGramSize - 1 ) {
41
42
$ ngrams [$ index ] .= $ separator ;
42
43
}
43
44
}
44
45
}
45
- return $ ngrams ;
46
+ return $ ngrams ;
47
+ }
48
+
49
/**
 * Compute the frequency of every distinct n-gram together with the
 * positional frequencies of its tokens.
 *
 * For each distinct n-gram string the result holds an array laid out as:
 *   - key 0: how many times the n-gram occurs in $ngrams;
 *   - keys 1..N (N = number of tokens in the n-gram): for each token, how
 *     many n-grams in $ngrams carry that token at the same position;
 *   - keys N+1 onwards (only when N > 2): for each pair of positions
 *     (i, j) with i < j, enumerated as (0,1), (0,2), ..., (1,2), ..., how
 *     many n-grams carry both tokens at those positions.
 *
 * @param array $ngrams n-gram strings whose tokens are joined by $sep
 * @param string $sep separator used to split each n-gram into its tokens
 * @return array map of n-gram string => array of frequencies as described above
 */
static public function getFreq(array $ngrams, string $sep = ' ') : array
{
    // Occurrences of each distinct n-gram; the keys double as the de-duplicated list.
    $ngramCounts = array_count_values($ngrams);

    // Every n-gram split into tokens once, so the positional counts can scan them.
    $tokenizedNgrams = self::ngramsAsArray($sep, $ngrams);

    $frequencies = array();

    foreach ($ngramCounts as $ngramString => $ngramFrequency) {
        // Key 0 always carries the frequency of the n-gram itself.
        $frequencies[$ngramString] = array($ngramFrequency);

        // array_count_values() turns numeric-looking strings into integer keys,
        // so cast back to string before splitting.
        $tokens = explode($sep, (string) $ngramString);
        $ngramSize = count($tokens);

        // Pair frequencies are stored right after the per-token frequencies.
        // A running index is used instead of an arithmetic formula on the two
        // positions: the formula "size + i + j" maps different pairs to the same
        // key once the n-gram has four or more tokens, overwriting earlier values.
        // For n-grams of up to three tokens both schemes produce the same keys.
        $pairIndex = $ngramSize + 1;

        foreach ($tokens as $position => $token) {
            // Frequency of this token at this position across all n-grams.
            $frequencies[$ngramString][$position + 1] = self::countFreq($tokenizedNgrams, $token, $position);

            // For a bigram the single pair is the n-gram itself, so pair
            // frequencies are only recorded for larger n-grams.
            if ($ngramSize > 2) {
                for ($other = $position + 1; $other < $ngramSize; $other++) {
                    $frequencies[$ngramString][$pairIndex] = self::countFreq($tokenizedNgrams, $token, $position, $tokens[$other], $other);
                    $pairIndex++;
                }
            }
        }
    }

    return $frequencies;
}
87
+
88
/**
 * Count how many tokenized n-grams carry the given token(s) at the given position(s).
 *
 * With only $str1/$pos1, an n-gram is counted when its token at $pos1 is
 * identical (===) to $str1. When both $str2 and $pos2 are also provided, the
 * n-gram must additionally have $str2 at $pos2. N-grams that are too short
 * to have a token at a requested position are simply not counted.
 *
 * @param array $ngramsArray list of n-grams, each one an array of tokens
 * @param string $str1 first token to look for
 * @param int $pos1 position at which $str1 must occur
 * @param string $str2 optional second token to look for
 * @param int $pos2 position at which $str2 must occur
 * @return int number of matching n-grams
 */
static private function countFreq(array $ngramsArray, string $str1, int $pos1, string $str2 = null, int $pos2 = null) : int
{
    // The second constraint applies only when both of its parts were supplied.
    $matchPair = isset($str2) && isset($pos2);

    $count = 0;

    foreach ($ngramsArray as $ngramArray) {
        // "?? null" keeps a missing position from raising an undefined-offset
        // diagnostic; null can never be identical to a string, so the n-gram
        // is skipped exactly as a mismatch would be.
        if (($ngramArray[$pos1] ?? null) !== $str1) {
            continue;
        }

        if ($matchPair && ($ngramArray[$pos2] ?? null) !== $str2) {
            continue;
        }

        $count++;
    }

    return $count;
}
116
+
117
/**
 * Split every n-gram string into its array of tokens.
 *
 * @param string $sep separator on which each n-gram is split
 * @param array $ngrams n-gram strings
 * @return array 0-based list holding one token array per n-gram, in input order
 */
static private function ngramsAsArray(string $sep, array $ngrams) : array
{
    $splitIntoTokens = function ($ngram) use ($sep) {
        return explode($sep, $ngram);
    };

    // array_map() keeps the input keys when given a single array;
    // array_values() turns the result back into a plain 0-based list.
    return array_values(array_map($splitIntoTokens, $ngrams));
}
47
130
}
0 commit comments