Skip to content

Commit 81efd4b

Browse files
committed
新增模型字段到CreateDatasetInput类,优化FineTuningService以支持模型参数,更新前端页面以允许用户选择和导出数据集,添加微调任务进度对话框和整体数据集导出功能。
1 parent bd6e5fe commit 81efd4b

File tree

4 files changed

+366
-30
lines changed

4 files changed

+366
-30
lines changed

src/KoalaWiki/Dto/FineTuningDto.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ public class CreateDatasetInput
3535
/// 提示词
3636
/// </summary>
3737
public string Prompt { get; set; } = string.Empty;
38+
39+
public string Model { get; set; } = string.Empty;
3840
}
3941

4042
/// <summary>

src/KoalaWiki/Services/FineTuningService.cs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Text;
22
using System.Text.Json;
3+
using System.Text.RegularExpressions;
34
using FastService;
45
using KoalaWiki.Core.DataAccess;
56
using KoalaWiki.Domains;
@@ -48,6 +49,7 @@ public async Task<TrainingDataset> CreateDatasetAsync(CreateDatasetInput input)
4849
Status = TrainingDatasetStatus.NotStarted,
4950
Name = input.Name,
5051
Endpoint = input.Endpoint,
52+
Model = input.Model,
5153
ApiKey = input.ApiKey,
5254
Prompt = input.Prompt
5355
};
@@ -293,11 +295,37 @@ await context.Response.WriteAsync($"data: {JsonSerializer.Serialize(new
293295
await context.Response.Body.FlushAsync();
294296
}
295297

296-
await koala.FineTuningTasks.Where(x => x.Id == dataset.Id)
298+
var datasetContent = sb.ToString();
299+
300+
// Extract the content enclosed by the <data>...</data> tags (first match only); if absent, the full output is kept
301+
var regex = new Regex(@"<data>(.*?)</data>", RegexOptions.Singleline);
302+
303+
var match = regex.Match(datasetContent);
304+
if (match.Success)
305+
{
306+
datasetContent = match.Groups[1].Value;
307+
}
308+
309+
// Extract the body of the first ```json fenced code block, if one is present
310+
regex = new Regex(@"```json(.*?)```", RegexOptions.Singleline);
311+
match = regex.Match(datasetContent);
312+
if (match.Success)
313+
{
314+
datasetContent = match.Groups[1].Value;
315+
}
316+
317+
318+
await koala.FineTuningTasks.Where(x => x.Id == task.Id)
297319
.ExecuteUpdateAsync(x => x.SetProperty(a => a.Status, FineTuningTaskStatus.Completed)
298-
.SetProperty(x => x.Dataset, sb.ToString())
320+
.SetProperty(x => x.Dataset, datasetContent)
321+
.SetProperty(x=>x.OriginalDataset,sb.ToString())
299322
.SetProperty(a => a.CompletedAt, DateTime.Now),
300323
context.RequestAborted);
324+
325+
await koala.TrainingDatasets.Where(x => x.Id == dataset.Id)
326+
.ExecuteUpdateAsync(x => x.SetProperty(a => a.Status, TrainingDatasetStatus.Completed)
327+
.SetProperty(a => a.UpdatedAt, DateTime.Now),
328+
context.RequestAborted);
301329

302330
await context.Response.WriteAsync($"data: {JsonSerializer.Serialize(new
303331
{

web/app/admin/finetune/create/page.tsx

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,9 @@ export default function CreateDatasetPage() {
214214
<Select
215215
placeholder="选择用于生成数据集的模型"
216216
loading={modelLoading}
217+
// 支持搜索
218+
showSearch
219+
// 支持自己填写模型
217220
notFoundContent={modelLoading ? <Spin size="small" /> : (
218221
models.length === 0 ? "请先填写API端点和密钥并刷新" : "没有找到模型"
219222
)}
@@ -238,7 +241,7 @@ Here is the Markdown content you need to process:
238241
{{markdown_content}}
239242
</markdown_content>
240243
241-
Your task is to create 10-15 high-quality instruction-response pairs based on this Markdown document. These pairs will be used to train an AI model, so they should be diverse, comprehensive, and accurately reflect the content of the document.
244+
Your task is to create 15-20 high-quality instruction-response pairs based on this Markdown document. These pairs will be used to train an AI model, so they should be diverse, comprehensive, and accurately reflect the content of the document.
242245
243246
When creating instructions:
244247
1. Use a variety of formats (questions, commands, requests, etc.)
@@ -254,8 +257,7 @@ When creating responses:
254257
255258
Ensure that your dataset comprehensively covers all core content and knowledge structures in the document.
256259
257-
Your output should be in the following JSON format:
258-
260+
Your output should be in the following JSON format using <data> tags:
259261
<data>
260262
[
261263
{"instruction": "Instruction content", "input": "", "output": "Response content"},

0 commit comments

Comments
 (0)