labring
diff --git a/‎docSite/assets/imgs/datasetSetting1.png‎
54.3 KB b/‎docSite/assets/imgs/datasetSetting1.png‎
54.3 KB
diff --git a/‎docSite/content/docs/installation/upgrading/46.md‎
Lines changed: 3 additions & 2 deletions b/‎docSite/content/docs/installation/upgrading/46.md‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎docSite/content/docs/pricing.md‎
Lines changed: 4 additions & 4 deletions b/‎docSite/content/docs/pricing.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docSite/content/docs/use-cases/datasetEngine.md‎
Lines changed: 14 additions & 6 deletions b/‎docSite/content/docs/use-cases/datasetEngine.md‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎packages/global/common/string/textSplitter.ts‎
Lines changed: 2 additions & 2 deletions b/‎packages/global/common/string/textSplitter.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/global/core/dataset/type.d.ts‎
Lines changed: 4 additions & 2 deletions b/‎packages/global/core/dataset/type.d.ts‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎packages/global/support/wallet/bill/api.d.ts‎
Lines changed: 2 additions & 0 deletions b/‎packages/global/support/wallet/bill/api.d.ts‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎packages/service/core/app/schema.ts‎
Lines changed: 0 additions & 1 deletion b/‎packages/service/core/app/schema.ts‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎packages/service/core/dataset/collection/schema.ts‎
Lines changed: 0 additions & 1 deletion b/‎packages/service/core/dataset/collection/schema.ts‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎packages/service/core/dataset/schema.ts‎
Lines changed: 5 additions & 0 deletions b/‎packages/service/core/dataset/schema.ts‎
Lines changed: 5 additions & 0 deletions
@@ -50,5 +50,6 @@ curl --location --request POST 'https://{{host}}/api/admin/initv46-2' \
 1. 新增 - 团队空间
 2. 新增 - 多路向量（多个向量映射一组数据）
 3. 新增 - tts语音
-4. 线上环境新增 - ReRank向量召回，提高召回精度
-5. 优化 - 知识库导出，可直接触发流下载，无需等待转圈圈
+4. 新增 - 支持知识库配置文本预处理模型
+5. 线上环境新增 - ReRank向量召回，提高召回精度
+6. 优化 - 知识库导出，可直接触发流下载，无需等待转圈圈
@@ -1,10 +1,10 @@
 ---
-title: '定价'
-description: 'FastGPT 的定价'
+title: '线上版定价'
+description: 'FastGPT 线上版定价'
 icon: 'currency_yen'
 draft: false
 toc: true
-weight: 10
+weight: 11
 ---
 
 ## Tokens 说明
@@ -15,7 +15,7 @@ weight: 10
 
 ## FastGPT 线上计费
 
-目前，FastGPT 线上计费也仅按 Tokens 使用数量为准。以下是详细的计费表（最新定价以线上表格为准，可在点击充值后实时获取）：
+使用 [https://fastgpt.run](https://fastgpt.run) 或 [https://ai.fastgpt.in](https://ai.fastgpt.in) 时，仅按 Tokens 使用数量扣费。可在 账号-使用记录 中查看具体使用情况，以下是详细的计费表（最新定价以线上表格为准，可在点击充值后实时获取）：
 
 {{< table "table-hover table-striped-columns" >}}
 | 计费项                 | 价格: 元/ 1K tokens（包含上下文） |
 
@@ -1,6 +1,6 @@
 ---
 title: "知识库结构讲解"
-description: "本节会介绍 FastGPT 知识库结构设计，理解其 QA 的存储格式和检索格式，以便更好的构建知识库。这篇介绍主要以使用为主，详细原理不多介绍。"
+description: "本节会详细介绍 FastGPT 知识库结构设计，理解其 QA 的存储格式和多向量映射，以便更好地构建知识库。这篇介绍主要以使用为主，详细原理不多介绍。"
 icon: "dataset"
 draft: false
 toc: true
@@ -25,13 +25,21 @@ FastGPT 采用了 RAG 中的 Embedding 方案构建知识库，要使用好 Fast
 
 FastGPT 采用了 `PostgreSQL` 的 `PG Vector` 插件作为向量检索器，索引为`HNSW`。且`PostgreSQL`仅用于向量检索，`MongoDB`用于其他数据的存取。
 
-在`PostgresSQL`的表中，设置一个 `index` 字段用于存储向量、一个 `q` 字段用于存储向量对应的内容，以及一个 `a` 字段用于检索映射。之所以取字段为 `qa` 是由于一些历史缘故，无需完全解为 “问答对” 的格式。在实际使用过程中，可以利用`q`和`a`的组合，对检索后的内容做进一步的声明，提高大模型的理解力（注意，这里不直接提高搜索精度）。
+在`PostgreSQL`的表中，设置一个 `index` 字段用于存储向量，以及一个`data_id`用于在`MongoDB`中寻找对应的映射值。多个`index`可以对应一组`data_id`，也就是说，多组向量可以对应同一组数据。在进行检索时，相同数据会进行合并。
 
-目前，提高向量搜索的精度，主要可以通过几种途径：
+![](/imgs/datasetSetting1.png)
 
-1. 精简`q`的内容，减少向量内容的长度：当`q`的内容更少，更准确时，检索精度自然会提高。但与此同时，会牺牲一定的检索范围，适合答案较为严格的场景。
-2. 更好分词分段：当一段话的结构和语义是完整的，并且是单一的，精度也会提高。因此，许多系统都会优化分词器，尽可能的保障每组数据的完整性。
-3. 多样性文本：为一段内容增加关键词、摘要、相似问题等描述性信息，可以使得该内容的向量具有更大的检索覆盖范围。
+## 多向量的目的和使用方式
+
+在一组数据中，如果我们希望它尽可能长，但语义又要在向量中尽可能体现，则没有办法通过一组向量来表示。因此，我们采用了多向量映射的方式，将一组数据映射到多组向量中，从而保障数据的完整性和语义的体现。
+
+你可以为一组较长的文本，添加多组向量，从而在检索时，只要其中一组向量被检索到，该数据也将被召回。
+
+## 提高向量搜索精度的方法
+
+1. 更好地分词分段：当一段话的结构和语义是完整的，并且是单一的，精度也会提高。因此，许多系统都会优化分词器，尽可能地保障每组数据的完整性。
+2. 精简`index`的内容，减少向量内容的长度：当`index`的内容更少，更准确时，检索精度自然会提高。但与此同时，会牺牲一定的检索范围，适合答案较为严格的场景。
+3. 丰富`index`的数量，可以为同一个`chunk`内容增加多组`index`。
 4. 优化检索词：在实际使用过程中，用户的问题通常是模糊的或是缺失的，并不一定是完整清晰的问题。因此优化用户的问题（检索词）很大程度上也可以提高精度。
 5. 微调向量模型：由于市面上直接使用的向量模型都是通用型模型，在特定领域的检索精度并不高，因此微调向量模型可以很大程度上提高专业领域的检索效果。
 
 
@@ -63,8 +63,8 @@ export const splitText2Chunks = (props: { text: string; maxLen: number; overlapL
     let chunks: string[] = [];
     for (let i = 0; i < splitTexts.length; i++) {
       let text = splitTexts[i];
-      let chunkToken = countPromptTokens(lastChunk, '');
-      const textToken = countPromptTokens(text, '');
+      let chunkToken = lastChunk.length;
+      const textToken = text.length;
 
       // next chunk is too large / new chunk is too large(The current chunk must be smaller than maxLen)
       if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
 
@@ -1,4 +1,4 @@
-import type { VectorModelItemType } from '../../core/ai/model.d';
+import type { LLMModelItemType, VectorModelItemType } from '../../core/ai/model.d';
 import { PermissionTypeEnum } from '../../support/permission/constant';
 import { PushDatasetDataChunkProps } from './api';
 import {
@@ -19,6 +19,7 @@ export type DatasetSchemaType = {
   avatar: string;
   name: string;
   vectorModel: string;
+  agentModel: string;
   tags: string[];
   type: `${DatasetTypeEnum}`;
   permission: `${PermissionTypeEnum}`;
@@ -84,8 +85,9 @@ export type CollectionWithDatasetType = Omit<DatasetCollectionSchemaType, 'datas
 };
 
 /* ================= dataset ===================== */
-export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel'> & {
+export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
   vectorModel: VectorModelItemType;
+  agentModel: LLMModelItemType;
   isOwner: boolean;
   canWrite: boolean;
 };
 
@@ -3,6 +3,8 @@ import { BillListItemType } from './type';
 
 export type CreateTrainingBillProps = {
   name: string;
+  vectorModel?: string;
+  agentModel?: string;
 };
 
 export type ConcatBillProps = {
 
@@ -61,7 +61,6 @@ const AppSchema = new Schema({
 
 try {
   AppSchema.index({ updateTime: -1 });
-  AppSchema.index({ 'share.collection': -1 });
 } catch (error) {
   console.log(error);
 }
 
@@ -69,7 +69,6 @@ const DatasetCollectionSchema = new Schema({
 
 try {
   DatasetCollectionSchema.index({ datasetId: 1 });
-  DatasetCollectionSchema.index({ userId: 1 });
   DatasetCollectionSchema.index({ updateTime: -1 });
 } catch (error) {
   console.log(error);
 
@@ -48,6 +48,11 @@ const DatasetSchema = new Schema({
     required: true,
     default: 'text-embedding-ada-002'
   },
+  agentModel: {
+    type: String,
+    required: true,
+    default: 'gpt-3.5-turbo-16k'
+  },
   type: {
     type: String,
     enum: Object.keys(DatasetTypeMap),
Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,6 @@ const AppSchema = new Schema({`
`61`	`61`
`62`	`62`	`try {`
`63`	`63`	`AppSchema.index({ updateTime: -1 });`
`64`		`- AppSchema.index({ 'share.collection': -1 });`
`65`	`64`	`} catch (error) {`
`66`	`65`	`console.log(error);`
`67`	`66`	`}`