Commit cfe00a7

Merge pull request #143 from KenelmQLH/update_doc
[DOC] Update demo for knowledge prediction and paper segmentation
2 parents: 214edcf + a3c281c

11 files changed (+1196 −7 lines)

EduNLP/Pretrain/disenqnet_vec.py

Lines changed: 0 additions & 1 deletion
@@ -161,7 +161,6 @@ def preprocess_dataset(pretrained_dir, disen_tokenizer, items, data_formation, t
     if not os.path.exists(concept_list_path):
         concepts = set()
         for data in items:
-            print(data)
             concept = data[data_formation["knowledge"]]
             for c in concept:
                 if c not in concepts:

EduNLP/Tokenizer/tokenizer.py

Lines changed: 3 additions & 2 deletions
@@ -115,7 +115,7 @@ def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
 
 
 class PureTextTokenizer(Tokenizer):
-    def __init__(self, handle_figure_formula="skip", **kwargs):
+    def __init__(self, symbol="gmas", handle_figure_formula="skip", **kwargs):
         """
         Treat all elements in SIF item as pure text. Specially, tokenize formulas as text.
@@ -184,13 +184,14 @@ def __init__(self, handle_figure_formula="skip", **kwargs):
             "text_params": text_params,
             "figure_params": kwargs.get("figure_params", None)
         }
+        self.symbol = symbol
 
     def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)
 
     def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
-        return tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params, **kwargs).tokens
+        return tokenize(seg(key(item), symbol=self.symbol), **self.tokenization_params, **kwargs).tokens
 
 
 class AstFormulaTokenizer(Tokenizer):
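
The change above promotes the previously hard-coded segmentation symbol to a constructor argument, with the old value as its default. A minimal usage sketch (the item string is invented, and the alternative symbol value is an assumption — valid symbol strings are defined by EduNLP's seg(), not by this example):

from EduNLP.Tokenizer import PureTextTokenizer

items = ["已知集合 $A=\\{x \\mid x^{2}-3x-4<0\\}$,则 $A=$"]

# Default construction keeps the old behaviour (symbol="gmas").
tokenizer = PureTextTokenizer()
print(next(tokenizer(items)))

# The segmentation symbol can now be overridden per instance.
custom_tokenizer = PureTextTokenizer(symbol="gm")  # "gm" is illustrative only
print(next(custom_tokenizer(items)))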
Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Hierarchical Knowledge Prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import yaml\n",
+    "import tqdm\n",
+    "import torch\n",
+    "import numpy as np\n",
+    "from EduNLP.Pretrain import BertTokenizer\n",
+    "from EduNLP.ModelZoo.bert import BertForKnowledgePrediction\n",
+    "from EduNLP.Pretrain import finetune_bert_for_knowledge_prediction\n",
+    "from EduNLP.ModelZoo import load_items\n",
+    "\n",
+    "from utils import compute_perfs_per_layer, get_onehot_label_topk, metric, compute_perfs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = load_items(\"/path/to/data/train.jsonl\")\n",
+    "test_data = load_items(\"/path/to/data/test.jsonl\")\n",
+    "\n",
+    "pretrained_model_dir = \"/path/to/bert/checkpoint\"\n",
+    "checkpoint_dir = \"/path/to/knowledge_model/checkpoint\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Take BERT as an example\n",
+    "data_params = {\n",
+    "    \"stem_key\": \"ques_content\",\n",
+    "    \"label_key\": \"know_list\"\n",
+    "}\n",
+    "train_params = {\n",
+    "    \"num_train_epochs\": 1,\n",
+    "    \"per_device_train_batch_size\": 2,\n",
+    "    \"per_device_eval_batch_size\": 2,\n",
+    "    \"no_cuda\": True,\n",
+    "}\n",
+    "model_params = {\n",
+    "    \"num_classes_list\": [10, 27, 963],\n",
+    "    \"num_total_classes\": 1000,\n",
+    "}\n",
+    "\n",
+    "\n",
+    "\"\"\"\n",
+    "Data format:\n",
+    "{\n",
+    "    'ques_content': 'question...',\n",
+    "    'know_list': [lay_1_id, lay_2_id, lay_3_id]\n",
+    "}\n",
+    "\"\"\"\n",
+    "\n",
+    "# train without eval_items\n",
+    "finetune_bert_for_knowledge_prediction(\n",
+    "    train_data,\n",
+    "    checkpoint_dir,\n",
+    "    pretrained_model=pretrained_model_dir,\n",
+    "    train_params=train_params,\n",
+    "    data_params=data_params,\n",
+    "    model_params=model_params\n",
+    ")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the Model and Evaluation Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build one-hot labels over the flattened label space for the multi-label task\n",
+    "class EvalDataset(torch.utils.data.Dataset):\n",
+    "    def __init__(self, data) -> None:\n",
+    "        self.data = data\n",
+    "        self.num_classes = sum(model_params['num_classes_list'])\n",
+    "        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        text, labels = self.data[idx][\"ques_content\"], self.data[idx][\"know_list\"]\n",
+    "        encodings = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')\n",
+    "        for k, v in encodings.items():\n",
+    "            encodings[k] = torch.squeeze(v, dim=0)\n",
+    "        one_hot_labels = [1. if i in labels else 0. for i in range(self.num_classes)]\n",
+    "        return encodings, torch.FloatTensor(one_hot_labels)\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "test_dataset = EvalDataset(test_data)\n",
+    "eval_dataloader = torch.utils.data.DataLoader(\n",
+    "    test_dataset,\n",
+    "    batch_size=1,\n",
+    "    shuffle=False,\n",
+    "    num_workers=4,\n",
+    ")\n",
+    "\n",
+    "model = BertForKnowledgePrediction.from_pretrained(checkpoint_dir)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = \"cuda\" if not train_params[\"no_cuda\"] else \"cpu\"\n",
+    "\n",
+    "# Configuration of the hierarchical knowledge labels\n",
+    "levels = len(model_params[\"num_classes_list\"])\n",
+    "classes_offset_list = [0, 10, 37]\n",
+    "classes_border_list = [[0, 9], [10, 36], [37, 999]]  # inclusive id borders per level\n",
+    "hierarchy_dict = {}  # child_know_id_to_parent_know_id\n",
+    "\n",
+    "# top-k values to evaluate\n",
+    "top_k_list = [10, 20, 30]\n",
+    "\n",
+    "model.to(device).eval()\n",
+    "perfs_per_layer = [np.array([0 for _ in range(4)], dtype=np.int32) for _ in range(levels)]\n",
+    "total_perfs = np.array([0 for _ in range(4)], dtype=np.int32)\n",
+    "\n",
+    "k_total_perfs_list = [np.array([0 for _ in range(4)], dtype=np.int32) for _ in range(len(top_k_list))]\n",
+    "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n",
+    "    input_data, eval_batch_labels = eval_batch\n",
+    "    input_data = {k: v.to(device) for k, v in input_data.items()}\n",
+    "    _, output_logits = model(**input_data)\n",
+    "\n",
+    "    local_perfs_per_layer, local_perfs = compute_perfs_per_layer(\n",
+    "        output_logits.cpu().detach().numpy(),\n",
+    "        eval_batch_labels.cpu().detach().numpy(),\n",
+    "        hierarchy_dict,\n",
+    "        classes_border_list,\n",
+    "        keep_consistency=True\n",
+    "    )\n",
+    "    perfs_per_layer = [perfs_per_layer[idx] + local_perfs_per_layer[idx] for idx in range(levels)]\n",
+    "    total_perfs += local_perfs\n",
+    "\n",
+    "    # for recall@k\n",
+    "    for i_k, k in enumerate(top_k_list):\n",
+    "        pred_topk = get_onehot_label_topk(\n",
+    "            classes_border_list, classes_offset_list, scores_list=output_logits.cpu().detach().numpy(), top_num=k)\n",
+    "        flat_pred_topk = np.array([x[3] for x in pred_topk])\n",
+    "        k_total_perfs = compute_perfs(flat_pred_topk, eval_batch_labels.cpu().detach().numpy().tolist())\n",
+    "        k_total_perfs_list[i_k] += k_total_perfs\n",
+    "\n",
+    "# overall metrics\n",
+    "micro_precision, micro_recall, micro_f1, total_acc = metric(*total_perfs)\n",
+    "print(f\"Eval Results: Micro-Precision: {micro_precision:.4f}, \"\n",
+    "      + f\"Micro-Recall: {micro_recall:.4f}, Micro-F1: {micro_f1:.4f}, Acc: {total_acc:.4f}\")\n",
+    "\n",
+    "# metrics per top_k\n",
+    "for i_k, k_total_perfs in enumerate(k_total_perfs_list):\n",
+    "    k = top_k_list[i_k]\n",
+    "    precision, recall, f1, acc = metric(*k_total_perfs)\n",
+    "    print(f\"TOPK={k}: Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, F1@{k}: {f1:.4f}, Acc@{k}: {acc:.4f}\")\n",
+    "\n",
+    "# metrics per layer\n",
+    "for layer_idx, perfs in enumerate(perfs_per_layer):\n",
+    "    precision, recall, f1, acc = metric(*perfs)\n",
+    "    print(f\"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
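
The demo leaves hierarchy_dict empty, which makes the keep_consistency step in compute_perfs_per_layer a no-op. For it to take effect, the dict must map each child knowledge id to its parent id in the flattened 0-999 label space (layer 1: ids 0-9, layer 2: ids 10-36, layer 3: ids 37-999). A minimal sketch with made-up ids, purely for illustration:

# Hypothetical hierarchy: keys are child ids, values are their parent ids,
# both in the same global id space as the one-hot labels.
hierarchy_dict = {
    10: 0,   # layer-2 knowledge 10 is a child of layer-1 knowledge 0
    11: 0,
    37: 10,  # layer-3 knowledge 37 is a child of layer-2 knowledge 10
    38: 11,
}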
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+import numpy as np
+import torch
+import heapq
+from EduNLP.Pretrain import BertTokenizer
+
+
+def get_onehot_label_topk(classes_border_list, classes_offset_list, scores_list: np.ndarray, top_num=1):
+    """
+    Get the predicted labels based on the topK.
+
+    Args:
+        classes_border_list: inclusive [start, end] id borders of each level
+        classes_offset_list: id offset of each level in the flattened label space
+        scores_list: the predicted scores for all classes provided by the network
+        top_num: the max topK number (default: 1)
+    Returns:
+        pred_onehot_labels: the predicted labels (one-hot), per level and flattened
+    """
+    pred_onehot_labels = []
+    scores_list = np.ndarray.tolist(scores_list)
+    border, offset = classes_border_list, classes_offset_list
+    num_level = len(border)
+    for scores in scores_list:
+        onehot_labels_list = [0] * len(scores)
+        hlabels = {}
+        for level in range(num_level):
+            begin, end = border[level][0], border[level][1]
+            cur_scores = scores[begin: end + 1]
+            cur_offset = offset[level]
+            cur_onehot_labels_list = [0] * len(cur_scores)
+            max_num_index_list = list(map(cur_scores.index, heapq.nlargest(top_num, cur_scores)))
+            for i in max_num_index_list:
+                cur_onehot_labels_list[i] = 1
+                onehot_labels_list[i + cur_offset] = 1
+            hlabels[level] = cur_onehot_labels_list
+        hlabels[num_level] = onehot_labels_list
+        pred_onehot_labels.append(hlabels)
+    return pred_onehot_labels
+
+
+def compute_perfs(pred_labels: np.ndarray, true_labels: np.ndarray) -> np.ndarray:
+    # TP: number of labels which are predicted True and are actually True.
+    TP = np.sum(pred_labels * true_labels)
+    # FP: number of labels which are predicted True but are actually False.
+    FP = np.sum(((pred_labels - true_labels) > 0).astype(np.int32))
+    # FN: number of labels which are predicted False but are actually True.
+    FN = np.sum(((true_labels - pred_labels) > 0).astype(np.int32))
+    # TN: number of labels which are predicted False and are actually False.
+    TN = np.sum(((pred_labels + true_labels) == 0).astype(np.int32))
+    return np.array([TP, FP, FN, TN], dtype=np.int32)
+
+
+def compute_perfs_per_layer(outputs: np.ndarray, true_labels: np.ndarray, hierarchy: dict, classes_border_list: list, keep_consistency: bool = True, threshold=0.5) -> tuple:
+    def _make_labels_consistent(input_labels: np.ndarray, hierarchy: dict):
+        input_labels = input_labels.astype(np.int32)
+        while len(hierarchy) > 0:
+            # leaves of the remaining hierarchy: children that are no one's parent
+            bottom_labels = set(hierarchy.keys()) - set(hierarchy.values())
+            for child in bottom_labels:
+                mask = (input_labels[:, child] == 1).astype(np.int32)
+                input_labels[:, hierarchy[child]] |= mask
+            for k in bottom_labels:
+                hierarchy.pop(k)
+        return input_labels
+
+    preds = []
+    for (start, end) in classes_border_list:
+        # a label is predicted if its score passes the threshold or is the per-level maximum
+        threshold_labels = (outputs[:, start:end + 1] >= threshold).astype(np.int32)
+        max_labels = (outputs[:, start:end + 1] == outputs[:, start:end + 1].max(axis=1)[:, None]).astype(np.int32)
+        preds.append(threshold_labels | max_labels)
+    pred_labels = np.concatenate(preds, axis=-1)
+    del preds
+    if keep_consistency:
+        pred_labels = _make_labels_consistent(pred_labels, hierarchy.copy())
+        true_labels = _make_labels_consistent(true_labels, hierarchy.copy())
+    # get perfs per layer
+    perfs_per_layer = []
+    for (start, end) in classes_border_list:
+        perfs_per_layer.append(compute_perfs(pred_labels[:, start:end + 1], true_labels[:, start:end + 1]))
+    total_perfs = compute_perfs(pred_labels, true_labels)
+    return perfs_per_layer, total_perfs
+
+
+def compute_topk_recall(topk_preds: list, true_labels: list) -> float:
+    rs = []
+    for pred, label in zip(topk_preds, true_labels):
+        _r = len(set(pred) & set(label)) / len(label)
+        rs.append(_r)
+    return np.mean(rs)
+
+
+def quantile(array: torch.Tensor, ratio: float):
+    """
+    Get the quantile of a 1-D array.
+    """
+    assert 0 <= ratio <= 1
+    assert len(array.shape) == 1
+    sorted_array = torch.sort(array, dim=-1, descending=True)[0]
+    # clamp to the last valid index to avoid an out-of-range access when ratio is close to 1
+    index = min(int(len(array) * ratio + 0.5), len(array) - 1)
+    return sorted_array[index].item()
+
+
+def metric(TP, FP, FN, TN):
+    def _f1_score(precision, recall):
+        if precision + recall == 0:
+            return 0.
+        else:
+            return 2 * precision * recall / (precision + recall)
+
+    if TP + FP == 0:
+        precision = 0
+    else:
+        precision = TP / (TP + FP)
+    if TP + FN == 0:
+        recall = 0
+    else:
+        recall = TP / (TP + FN)
+    micro_f1 = _f1_score(precision, recall)
+    acc = (TP + TN) / (TP + FP + FN + TN)
+    return precision, recall, micro_f1, acc
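
A quick sanity check of compute_perfs and metric on a toy batch (the arrays are invented for illustration; the expected counts are worked out by hand):

import numpy as np

# Two samples, four labels; rows are samples, columns are label ids.
pred = np.array([[1, 0, 1, 0],
                 [0, 1, 0, 0]])
true = np.array([[1, 0, 0, 0],
                 [0, 1, 1, 0]])

TP, FP, FN, TN = compute_perfs(pred, true)           # array([2, 1, 1, 4])
precision, recall, f1, acc = metric(TP, FP, FN, TN)  # 0.667, 0.667, 0.667, 0.75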
