
Commit 1bdde2b

Authored by MrChengmo, yinhaofeng, fuyinno4, seiriosPlus

Six PR for new paddlerec version (#240)

* logistic_regression
* adaptation windows

Co-authored-by: yinhaofeng <1841837261@qq.com>
Co-authored-by: yinhaofeng <66763551+yinhaofeng@users.noreply.github.com>
Co-authored-by: wuzhihua <35824027+fuyinno4@users.noreply.github.com>
Co-authored-by: tangwei12 <tangwei12@baidu.com>

1 parent 3b9c100 · commit 1bdde2b


47 files changed: +1304 -155 lines

models/match/dssm/data/preprocess.py (+2 -1)

````diff
@@ -63,7 +63,8 @@
 # split into training and test sets
 query_list = list(pos_dict.keys())
 #print(len(query_list))
-#random.shuffle(query_list)
+np.random.seed(107)
+np.random.shuffle(query_list)
 train_query = query_list[:11600]
 test_query = query_list[11600:]
````
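The change above replaces the commented-out, unseeded shuffle with a seeded NumPy shuffle, so the train/test split is reproducible across runs. A minimal standalone sketch of the same idea follows; the seed 107 and the cut-off pattern come from the diff, while the toy query list and the cut at 15 are invented here (the real script cuts at 11600):

```python
import numpy as np

# Toy stand-in for list(pos_dict.keys()); the real script builds this from the corpus.
query_list = ["query_%d" % i for i in range(20)]

np.random.seed(107)            # fixed seed -> the same order every run
np.random.shuffle(query_list)  # in-place shuffle

cut = 15                       # the real script uses 11600
train_query = query_list[:cut]
test_query = query_list[cut:]
```

Because the seed is fixed before the shuffle, rerunning the script always produces the same split, which makes the reported metrics comparable between runs.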

models/match/dssm/readme.md (+11 -5)

````diff
@@ -60,6 +60,12 @@ rm -f dssm%2Fbq.tar.gz
 已经在银行换了新预留号码。 我现在换了电话号码,这个需要更换吗 1
 Each field is tab-separated; columns 1 and 2 are the two texts, and column 3 is the label (0 or 1: 0 means the two texts are dissimilar, 1 means they are similar).
 ```
+This example uses the jieba and sklearn libraries. If they are not already installed in your environment, install them with:
+```
+pip install sklearn
+pip install jieba
+```
+
 ## Runtime environment
 PaddlePaddle>=1.7.2
@@ -153,11 +159,11 @@ (labels in label.txt for the test set)
 4. Go back to the dssm directory, open config.yaml, and change its parameters:
 
 Set workspace to your current absolute path (use the pwd command to obtain it).
-Change batch_size in dataset_train from 8 to 128.
-Change slice_end in hyper_parameters from 8 to 128; whenever you change the batch size, this parameter must change with it.
-Change data_path in dataset_train to {workspace}/data/big_train.
-Change data_path in dataset_infer to {workspace}/data/big_test.
-Change trigram_d in hyper_parameters to 5913.
+Change batch_size in dataset_train from 8 to 128.
+Change slice_end in hyper_parameters from 8 to 128; whenever you change the batch size, this parameter must change with it.
+Change data_path in dataset_train to {workspace}/data/big_train.
+Change data_path in dataset_infer to {workspace}/data/big_test.
+Change trigram_d in hyper_parameters to 5913.
 
 5. Run the script to start training. The script runs python -m paddlerec.run -m ./config.yaml, writes the output to the result file, then runs transform.py to consolidate the data, and finally computes the positive/negative order metric:
 ```
````

(The second hunk is an indentation/renumbering-only change; the instructions themselves are unchanged.)
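For orientation, after the edits described in step 4 the relevant config.yaml fields would look roughly like the fragment below. This is a hedged sketch, not the full file: the absolute workspace path is a placeholder, and all surrounding keys are omitted.

```yaml
workspace: "/path/to/PaddleRec/models/match/dssm"   # placeholder; use your own pwd output

dataset:
  - name: dataset_train
    batch_size: 128                           # was 8
    data_path: "{workspace}/data/big_train"
  - name: dataset_infer
    data_path: "{workspace}/data/big_test"

hyper_parameters:
  trigram_d: 5913
  slice_end: 128   # must track batch_size whenever it changes
```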

models/match/dssm/run.sh (+1 -5)

````diff
@@ -13,11 +13,7 @@
 # limitations under the License.
 #!/bin/bash
 echo "................run................."
-python -m paddlerec.run -m ./config.yaml &> result1.txt
-grep -i "query_doc_sim" ./result1.txt >./result2.txt
-sed '$d' result2.txt >result.txt
-rm -f result1.txt
-rm -f result2.txt
+python -m paddlerec.run -m ./config.yaml &> result.txt
 python transform.py
 sort -t $'\t' -k1,1 -k 2nr,2 pair.txt >result.txt
 rm -f pair.txt
````

models/match/dssm/transform.py (+14)

````diff
@@ -16,6 +16,20 @@
 import numpy as np
 import sklearn.metrics
 
+filename = './result.txt'
+f = open(filename, "r")
+lines = f.readlines()
+f.close()
+result = []
+for line in lines:
+    if "query_doc_sim" in str(line):
+        result.append(line)
+result = result[:-1]
+f = open(filename, "w")
+for i in range(len(result)):
+    f.write(str(result[i]))
+f.close()
+
 label = []
 filename = './data/label.txt'
 f = open(filename, "r")
````

models/match/match-pyramid/eval.py (+15 -1)

````diff
@@ -32,6 +32,20 @@ def eval_MAP(pred, gt):
     return map_value / r
 
 
+filename = './result.txt'
+f = open(filename, "r")
+lines = f.readlines()
+f.close()
+result = []
+for line in lines:
+    if "prediction" in str(line):
+        result.append(line)
+result = result[:-1]
+f = open(filename, "w")
+for i in range(len(result)):
+    f.write(str(result[i]))
+f.close()
+
 filename = './data/relation.test.fold1.txt'
 gt = []
 qid = []
@@ -56,7 +70,7 @@ def eval_MAP(pred, gt):
     pred.append(float(line))
 
 result_dict = {}
-for i in range(len(qid)):
+for i in range(len(pred)):
     if qid[i] not in result_dict:
         result_dict[qid[i]] = []
     result_dict[qid[i]].append([gt[i], pred[i]])
````
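The loop-bound fix in the second hunk matters because, after result.txt is filtered, pred can hold fewer entries than qid; iterating over len(pred) avoids an IndexError while still grouping every available prediction under its query id. A minimal sketch of that grouping step, on toy data (the real script reads qid and gt from relation.test.fold1.txt, and uses an explicit if-not-in check rather than setdefault):

```python
qid = ["q1", "q1", "q2", "q2", "q2"]
gt = [1, 0, 0, 1, 0]
pred = [0.9, 0.2, 0.4, 0.8]   # one short of qid, as can happen after filtering

result_dict = {}
for i in range(len(pred)):    # bound by pred, not qid, to avoid an IndexError
    result_dict.setdefault(qid[i], []).append([gt[i], pred[i]])
```

Each key then holds [label, score] pairs for one query, which is exactly the shape eval_MAP consumes per query.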

models/match/match-pyramid/readme.md (+6)

````diff
@@ -55,6 +55,12 @@
 3. Relation files: relation files store the relationship between two sentences, e.g. between a query and a document. Examples: relation.train.fold1.txt, relation.test.fold1.txt
 4. Embedding file: we store the pre-trained word vectors in the embedding file. Example: embed_wiki-pdc_d50_norm
 
+This example uses the jieba and sklearn libraries. If they are not already installed in your environment, install them with:
+```
+pip install sklearn
+pip install jieba
+```
+
 ## Runtime environment
 PaddlePaddle>=1.7.2
 python 2.7/3.5/3.6/3.7
````

models/match/match-pyramid/run.sh (+1 -5)

````diff
@@ -1,8 +1,4 @@
 #!/bin/bash
 echo "................run................."
-python -m paddlerec.run -m ./config.yaml &>result1.txt
-grep -i "prediction" ./result1.txt >./result2.txt
-sed '$d' result2.txt >result.txt
-rm -f result2.txt
-rm -f result1.txt
+python -m paddlerec.run -m ./config.yaml &>result.txt
 python eval.py
````

models/match/multiview-simnet/readme.md (+5)

````diff
@@ -61,6 +61,11 @@ rm -f dssm%2Fbq.tar.gz
 0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:1144 1:217 1:206 1:9 1:3 1:207 1:10 1:398 1:2 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
 0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:951 1:952 1:206 1:9 1:3 1:207 1:10 1:398 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
 ```
+This example uses the jieba and sklearn libraries. If they are not already installed in your environment, install them with:
+```
+pip install sklearn
+pip install jieba
+```
 
 ## Runtime environment
 PaddlePaddle>=1.7.2
````

models/match/multiview-simnet/run.sh (+2 -6)

````diff
@@ -14,12 +14,8 @@
 
 #!/bin/bash
 echo "................run................."
-python -m paddlerec.run -m ./config.yaml &>result1.txt
-grep -i "query_pt_sim" ./result1.txt >./result2.txt
-sed '$d' result2.txt >result.txt
-rm -f result1.txt
-rm -f result2.txt
+python -m paddlerec.run -m ./config.yaml &>result.txt
 python transform.py
-sort -t $'\t' -k1,1 -k 2nr,2 pair.txt >result.txt
+sort -t $'\t' -k1,1 -k 2nr,2 pair.txt &>result.txt
 rm -f pair.txt
 python ../../../tools/cal_pos_neg.py result.txt
````
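The sort call above orders pair.txt by query (field 1, ascending) and then by similarity score (field 2, numeric, descending), which is the ordering cal_pos_neg.py expects. The same two-level ordering expressed in Python, on made-up rows:

```python
# (query, similarity, label) rows, like the tab-separated columns of pair.txt
rows = [
    ("q2", 0.3, 1),
    ("q1", 0.7, 0),
    ("q1", 0.9, 1),
]

# Equivalent of `sort -t $'\t' -k1,1 -k 2nr,2`:
# primary key ascending, secondary key numeric descending (negated).
rows.sort(key=lambda r: (r[0], -r[1]))
```

Negating the numeric key is the usual Python idiom for a descending secondary sort while the primary sort stays ascending.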

models/match/multiview-simnet/transform.py (+16 -2)

````diff
@@ -15,6 +15,20 @@
 import random
 import numpy as np
 
+filename = './result.txt'
+f = open(filename, "r")
+lines = f.readlines()
+f.close()
+result = []
+for line in lines:
+    if "query_pt_sim" in str(line):
+        result.append(line)
+result = result[:-1]
+f = open(filename, "w")
+for i in range(len(result)):
+    f.write(str(result[i]))
+f.close()
+
 label = []
 filename = './data/label.txt'
 f = open(filename, "r")
@@ -31,7 +45,7 @@
 sim = []
 for line in open(filename):
     line = line.strip().split(",")
-    print(line)
+    #print(line)
     line[3] = line[3].split(":")
     line = line[3][1].strip(" ")
     line = line.strip("[")
@@ -50,6 +64,6 @@
 filename = 'pair.txt'
 f = open(filename, "w")
 for i in range(len(sim)):
-    print(i)
+    #print(i)
     f.write(str(query[i]) + "\t" + str(sim[i]) + "\t" + str(label[i]) + "\n")
 f.close()
````

models/match/readme.md (+2 -2)

````diff
@@ -53,6 +53,6 @@ python -m paddlerec.run -m models/contentunderstanding/match-pyramid/config.yaml
 
 | Dataset | Model | pos/neg order ratio | MAP |
 | :-----: | :---: | :-----------------: | :-: |
-| zhidao | DSSM | 2.25 | -- |
+| zhidao | DSSM | 2.75 | -- |
 | Letor07 | match-pyramid | -- | 0.42 |
-| zhidao | multiview-simnet | 1.72 | -- |
+| zhidao | multiview-simnet | 13.67 | -- |
````

models/rank/deepfm/config.yaml (+15 -12)

````diff
@@ -19,22 +19,22 @@ workspace: "models/rank/deepfm"
 
 dataset:
 - name: train_sample
-  type: QueueDataset
+  type: DataLoader
   batch_size: 5
   data_path: "{workspace}/data/sample_data/train"
   sparse_slots: "label feat_idx"
   dense_slots: "feat_value:39"
 - name: infer_sample
-  type: QueueDataset
+  type: DataLoader
   batch_size: 5
   data_path: "{workspace}/data/sample_data/train"
   sparse_slots: "label feat_idx"
   dense_slots: "feat_value:39"
 
 hyper_parameters:
   optimizer:
-    class: SGD
-    learning_rate: 0.0001
+    class: Adam
+    learning_rate: 0.001
   sparse_feature_number: 1086460
   sparse_feature_dim: 9
   num_field: 39
@@ -43,7 +43,7 @@ hyper_parameters:
   act: "relu"
 
 
-mode: train_runner
+mode: [train_runner,infer_runner]
 # if infer, change mode to "infer_runner" and change phase to "infer_phase"
 
 runner:
@@ -57,19 +57,22 @@ runner:
   save_checkpoint_path: "increment"
   save_inference_path: "inference"
   print_interval: 1
+  phases: phase1
 - name: infer_runner
   class: infer
   device: cpu
-  init_model_path: "increment/0"
+  init_model_path: "increment/1"
   print_interval: 1
+  phases: infer_phase
 
 
 phase:
 - name: phase1
   model: "{workspace}/model.py"
   dataset_name: train_sample
-  thread_num: 1
-#- name: infer_phase
-#  model: "{workspace}/model.py"
-#  dataset_name: infer_sample
-#  thread_num: 1
+  thread_num: 10
+- name: infer_phase
+  model: "{workspace}/model.py"
+  dataset_name: infer_sample
+  thread_num: 10
````

models/rank/deepfm/data/download_preprocess.py (+1 -1)

````diff
@@ -28,7 +28,7 @@
 
 print("download and extract starting...")
 download_file_and_uncompress(url)
-download_file(url2, "./sample_data/feat_dict_10.pkl2", True)
+download_file(url2, "./deepfm%2Ffeat_dict_10.pkl2", True)
 print("download and extract finished")
 
 print("preprocessing...")
````

models/rank/deepfm/data/get_slot_data.py (+1 -2)

````diff
@@ -79,8 +79,7 @@ def data_iter():
             v = i[1]
             for j in v:
                 s += " " + k + ":" + str(j)
-            print(s.strip())
-            yield None
+            print(s.strip())  # add print for data preprocessing
 
     return data_iter
````
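With `yield None` removed, the generator's only remaining job is to print each slot-formatted sample, so the accompanying run.sh can redirect the script's stdout straight into the slot data files. A toy sketch of how such a "name:value" line is assembled; the helper name and the sample slot names are mine, inferred from the concatenation in the diff rather than taken from the actual script:

```python
def to_slot_line(pairs):
    # pairs: list of (slot_name, list_of_values) for one parsed sample.
    s = ""
    for k, v in pairs:
        for j in v:
            s += " " + k + ":" + str(j)   # same concatenation as in get_slot_data.py
    return s.strip()                      # strip the leading space, as the script does
```

Printing one such line per sample means the slot files are just the captured stdout, with no intermediate in-memory dataset.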

models/rank/deepfm/data/run.sh (+1 -1)

````diff
@@ -1,5 +1,5 @@
 python download_preprocess.py
-
+mv ./deepfm%2Ffeat_dict_10.pkl2 sample_data/feat_dict_10.pkl2
 mkdir slot_train_data
 for i in `ls ./train_data`
 do
````

Binary files added (images):

- models/rank/deepfm/picture/1.jpg (6.64 KB)
- models/rank/deepfm/picture/2.jpg (4.12 KB)
- models/rank/deepfm/picture/3.jpg (11.3 KB)
- models/rank/deepfm/picture/4.jpg (25.8 KB)
