diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/SmoothNLP.iml b/.idea/SmoothNLP.iml
new file mode 100644
index 0000000..f409635
--- /dev/null
+++ b/.idea/SmoothNLP.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..abf150f
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..4e1828e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..32b9dc1
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/smoothnlp/algorithm/phrase/ngram_utils.py b/smoothnlp/algorithm/phrase/ngram_utils.py
index 0c33b32..0cb74a3 100644
--- a/smoothnlp/algorithm/phrase/ngram_utils.py
+++ b/smoothnlp/algorithm/phrase/ngram_utils.py
@@ -206,7 +206,8 @@ def get_scores(corpus,
left_right_entropy[word][0], #left_entropy
left_right_entropy[word][1], #right_entropy
min(left_right_entropy[word][0],left_right_entropy[word][1]), #branch entropy BE=min{left_entropy,right_entropy}
- word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1] #our score
+ word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1], #our score
+ ngram_freq[word] # word frequency
)
for word in joint_phrase}
diff --git a/smoothnlp/algorithm/phrase/phrase_extraction.py b/smoothnlp/algorithm/phrase/phrase_extraction.py
index 6f4acf8..83fd8a2 100644
--- a/smoothnlp/algorithm/phrase/phrase_extraction.py
+++ b/smoothnlp/algorithm/phrase/phrase_extraction.py
@@ -39,14 +39,16 @@ def extract_phrase(corpus,
chunk_size: int = 1000000,
min_n:int = 2,
max_n:int=4,
- min_freq:int = 5):
+ min_freq:int = 5,
+ order_by: str = 'score'):
'''
- 取前k个new words或前k%的new words
+ 按score或者freq取前k个new words或前k%的new words
:param corpus:
:param top_k:
:param chunk_size:
:param max_n:
:param min_freq:
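+ :param order_by: ranking key for the result: 'score' (default) sorts by the word score, 'freq' sorts by n-gram frequency, both in descending order; any other value is not handled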
:return:
'''
if isinstance(corpus,str):
@@ -57,7 +59,10 @@ def extract_phrase(corpus,
else:
corpus_splits = chunk_generator_adapter(corpus, chunk_size)
word_info_scores = get_scores(corpus_splits,min_n,max_n,chunk_size,min_freq)
- new_words = [item[0] for item in sorted(word_info_scores.items(),key=lambda item:item[1][-1],reverse = True)]
+ if order_by == 'score':
+ new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-2], reverse=True)]
+ elif order_by == 'freq':
+ new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-1], reverse=True)]
if top_k > 1: #输出前k个词
return new_words[:top_k]
elif top_k < 1: #输出前k%的词