Commit 28ff3b2

Chat index (intel#1073)

1 parent 2b34485 commit 28ff3b2
File tree: 3 files changed, +339 -0 lines changed

Lines changed: 284 additions & 0 deletions

import argparse
import json
import re

import pandas as pd  # used only by the xlsx loader sketched below
import PyPDF2
from docx import Document as DDocument
from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore
from haystack.schema import Document as SDocument
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma


def split_paragraph(text, jsonl_name, max_length=378):
    documents = []
    for sub in text:
        # normalize markdown headers and whitespace
        sub['doc'] = sub['doc'].replace('#', " ")
        sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
        new_doc = Document(page_content=sub['doc'], metadata={"source": sub['doc_id']})
        documents.append(new_doc)
    return documents


## indexing for jsonl file
def d_load_jsonl_file(file_path, process, max_length=378):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj)

    new_sens = []
    documents = []
    if not process:
        # keep each record as a single document
        for sub in data:
            sub['doc'] = sub['doc'].replace('#', " ")
            sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
            new_doc = Document(page_content=sub['doc'], metadata={"source": sub['doc_id']})
            documents.append(new_doc)
    else:
        # split each record into sentences, merging short fragments forward
        for sub in data:
            sub['doc'] = sub['doc'].replace('#', " ")
            split_sen = re.split(r'[.?!]', sub['doc'])
            for num in range(len(split_sen)):
                split_sen[num] = re.sub(r'\s+', ' ', split_sen[num])
                if num + 1 < len(split_sen):
                    if len(split_sen[num]) > max_length:
                        new_sens.append(split_sen[num].strip())
                    else:
                        split_sen[num + 1] = split_sen[num] + split_sen[num + 1]
                else:
                    new_sens.append(split_sen[num])

        print("chunks before deduplication:", len(new_sens))
        paragraphs = list(set(new_sens))
        print("chunks after deduplication:", len(paragraphs))
        documents = []
        metadata = {"source": file_path}
        for paragraph in paragraphs:
            new_doc = Document(page_content=paragraph, metadata=metadata)
            documents.append(new_doc)
    return documents


## TODO: xlsx loader, not yet functional
# def d_load_xlsx_file(file_path, process, max_length=378):
#     data = pd.read_excel(file_path)
#
#     documents = []
#     for sub in data:
#         sub['doc'] = sub['doc'].replace('#', " ")
#         if not process:
#             sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
#             new_doc = Document(page_content=sub['doc'], metadata={"source": sub['doc_id']})
#             documents.append(new_doc)

## indexing for pdf/docx files
def d_load_file(file_path, process, max_length=378):
    if file_path.endswith("pdf"):
        text = load_pdf(file_path)
    elif file_path.endswith("docx"):
        text = read_docx(file_path)

    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    # split the document after sentence-ending punctuation (zero-width match keeps the delimiter)
    sentences = re.split('(?<=[;!.?])', text)

    # merge adjacent pieces pairwise
    new_sents = []
    for i in range(len(sentences) // 2):
        sent = sentences[2 * i] + sentences[2 * i + 1]
        new_sents.append(sent)
    if len(sentences) % 2 == 1:
        new_sents.append(sentences[-1])

    # greedily pack sentences into paragraphs of at most max_length characters
    paragraphs = []
    current_length = 0
    current_paragraph = ""
    for sentence in new_sents:
        sentence_length = len(sentence)
        if current_length + sentence_length <= max_length:
            current_paragraph += sentence
            current_length += sentence_length
        else:
            paragraphs.append(current_paragraph.strip())
            current_paragraph = sentence
            current_length = sentence_length
    paragraphs.append(current_paragraph.strip())
    print("paragraphs before deduplication:", len(paragraphs))
    paragraphs = list(set(paragraphs))
    print("paragraphs after deduplication:", len(paragraphs))
    documents = []
    metadata = {"source": file_path}
    for paragraph in paragraphs:
        new_doc = Document(page_content=paragraph, metadata=metadata)
        documents.append(new_doc)
    return documents


### Load with sparse embedding for jsonl file
def s_load_jsonl_file(file_path, process, document_store, max_length=378):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj)

    new_sens = []
    documents = []
    if not process:
        # keep each record as a single document
        for sub in data:
            sub['doc'] = sub['doc'].replace('#', " ")
            sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
            new_doc = SDocument(content=sub['doc'], meta={"source": sub['doc_id']})
            documents.append(new_doc)
    else:
        # split each record into sentences, merging short fragments forward
        for sub in data:
            sub['doc'] = sub['doc'].replace('#', " ")
            split_sen = re.split(r'[.?!]', sub['doc'])
            for num in range(len(split_sen)):
                split_sen[num] = re.sub(r'\s+', ' ', split_sen[num])
                if num + 1 < len(split_sen):
                    if len(split_sen[num]) > max_length:
                        new_sens.append(split_sen[num].strip())
                    else:
                        split_sen[num + 1] = split_sen[num] + split_sen[num + 1]
                else:
                    new_sens.append(split_sen[num])

        print("chunks before deduplication:", len(new_sens))
        paragraphs = list(set(new_sens))
        print("chunks after deduplication:", len(paragraphs))
        documents = []
        metadata = {"source": file_path}
        for paragraph in paragraphs:
            new_doc = SDocument(content=paragraph, meta=metadata)
            documents.append(new_doc)
    document_store.write_documents(documents)
    return document_store


### Load with sparse embedding for pdf/docx files
def s_load_file(file_path, process, document_store, max_length=378):
    if file_path.endswith("pdf"):
        text = load_pdf(file_path)
    elif file_path.endswith("docx"):
        text = read_docx(file_path)

    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    # split the document after sentence-ending punctuation (zero-width match keeps the delimiter)
    sentences = re.split('(?<=[;!.?])', text)

    # merge adjacent pieces pairwise
    new_sents = []
    for i in range(len(sentences) // 2):
        sent = sentences[2 * i] + sentences[2 * i + 1]
        new_sents.append(sent.strip())
    if len(sentences) % 2 == 1:
        new_sents.append(sentences[-1])

    # greedily pack sentences into paragraphs of at most max_length characters
    paragraphs = []
    current_length = 0
    current_paragraph = ""
    for sentence in new_sents:
        sentence_length = len(sentence)
        if current_length + sentence_length <= max_length:
            current_paragraph += sentence
            current_length += sentence_length
        else:
            paragraphs.append(current_paragraph.strip())
            current_paragraph = sentence
            current_length = sentence_length
    paragraphs.append(current_paragraph.strip())
    print("paragraphs before deduplication:", len(paragraphs))
    paragraphs = list(set(paragraphs))
    print("paragraphs after deduplication:", len(paragraphs))
    documents = []
    metadata = {"source": file_path}
    for paragraph in paragraphs:
        # Haystack's Document takes `meta`, not `metadata`
        new_doc = SDocument(content=paragraph, meta=metadata)
        documents.append(new_doc)
    document_store.write_documents(documents)

    return document_store


def persist_embedding(documents, persist_directory, model_path):
    ## persist the embedded documents to local disk
    embedding = HuggingFaceInstructEmbeddings(model_name=model_path)
    vectordb = Chroma.from_documents(documents=documents, embedding=embedding, persist_directory=persist_directory)
    vectordb.persist()
    vectordb = None  # release the handle so the database is flushed


def read_docx(doc_path):
    doc = DDocument(doc_path)
    text = ''
    for paragraph in doc.paragraphs:
        text += paragraph.text
    return text


def load_pdf(pdf_path):
    text = ''
    with open(pdf_path, 'rb') as pdf_file:  # close the file handle when done
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--file_path', type=str, help='The user-uploaded file.',
                        default="/data1/lkk/llm_inference/chat-langchain/inc_documents_formated.jsonl")
    parser.add_argument('--process', action='store_true',
                        help='Whether to split the loaded content into sentence-level chunks.')
    parser.add_argument('--embedding_model', type=str, help='The model used to embed the content.',
                        default='/data1/lkk/instructor_large/')
    parser.add_argument('--output_path', type=str, help='Where to save the embedding.', default='db_jsonl122')
    parser.add_argument('--embedding_method', type=str,
                        help='Whether to use dense retrieval or sparse retrieval.', default='dense')
    parser.add_argument('--store', type=str,
                        help='The document store: Chroma for dense; inmemory or Elasticsearch for sparse.',
                        default='dense')

    args = parser.parse_args()

    if args.embedding_method == "dense":  # currently use Chroma as the dense retrieval datastore
        if args.file_path.endswith("jsonl"):
            documents = d_load_jsonl_file(args.file_path, args.process)
        elif args.file_path.endswith("pdf") or args.file_path.endswith("docx"):
            documents = d_load_file(args.file_path, args.process)
        else:
            documents = None
            print("{} is ignored. Will support this file format soon.".format(args.file_path))
        if documents:
            persist_embedding(documents, args.output_path, args.embedding_model)
    elif args.embedding_method == "sparse":  # sparse retrieval supports in-memory and Elasticsearch datastores
        if args.store == "inmemory":
            document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
        elif args.store == "Elasticsearch":
            document_store = ElasticsearchDocumentStore(host="localhost", index="elastic_index_1",
                                                        port=9200, search_fields=["content", "title"])
        if args.file_path.endswith("jsonl"):
            document_store = s_load_jsonl_file(args.file_path, args.process, document_store)
        elif args.file_path.endswith("pdf") or args.file_path.endswith("docx"):
            document_store = s_load_file(args.file_path, args.process, document_store)
        else:
            print("{} is ignored. Will support this file format soon.".format(args.file_path))
Lines changed: 51 additions & 0 deletions

Document Indexing
======
1. [Introduction](#introduction)
2. [Get Started](#get-started)

## Introduction

Document indexing assists users in parsing locally uploaded files and storing them in a document store for later content retrieval. We provide two separate indexing methods: sparse retrieval and dense retrieval.

Sparse Retrieval (SR) projects the content into a sparse vector that closely aligns with the vocabulary of the content's language. This can be achieved with traditional bag-of-words techniques such as TF-IDF or BM25.
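
A minimal sketch of sparse indexing and retrieval with Haystack's BM25-enabled in-memory store (the same store this repo configures; the documents and query below are only illustrative):

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from haystack.schema import Document

# index two toy documents in a BM25-enabled in-memory store
store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
store.write_documents([
    Document(content="Sparse retrieval ranks documents by weighted term overlap."),
    Document(content="Dense retrieval compares learned embedding vectors."),
])

retriever = BM25Retriever(document_store=store)
print(retriever.retrieve("How does sparse retrieval rank documents?", top_k=1))
```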

On the other hand, Dense Retrieval (DR) encodes the content as one or more dense vectors. Users can specify a local pretrained model or use a GPT model from OpenAI to obtain embeddings of the uploaded content. The choice between sparse retrieval and dense retrieval depends on the user's specific requirements.
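
A minimal sketch of obtaining a dense embedding with the `HuggingFaceInstructEmbeddings` wrapper the script uses (the model name and query here are only examples):

```python
from langchain.embeddings import HuggingFaceInstructEmbeddings

embedding = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
vector = embedding.embed_query("What is document indexing?")
print(len(vector))  # the dimensionality of the dense representation
```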

Our repository currently supports three document stores: `In Memory` and `Elasticsearch` for sparse retrieval, and `Chroma` for dense retrieval. Each document store has its own characteristics. The selection of a document store should be based on the maturity of your project, the intended use case, and the technical environment.

|Document store |Main features |Platform |
|:----------|:----------|:------------------|
|Elasticsearch |Sparse retrieval with many tuning options and basic support for dense retrieval. |Haystack|
|In Memory |Simple document store, with no extra services or dependencies. Not recommended for production. |Haystack|
|Chroma |Focused on dense retrieval. Easy to use, lightweight, and fast for retrieval. |LangChain|

Support for other document stores will be available soon.
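
For reference, the indexing script builds the two sparse stores as follows (the Elasticsearch settings mirror the script's defaults and assume a service listening on `localhost:9200`):

```python
from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore

# in-memory store: no external service, but nothing is persisted
document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)

# Elasticsearch store: persistent, requires a running Elasticsearch service
# document_store = ElasticsearchDocumentStore(host="localhost", port=9200,
#                                             index="elastic_index_1",
#                                             search_fields=["content", "title"])
```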

Currently, users can upload files in PDF, DOCX, or JSONL format. After indexing, the user can easily edit the local document store to add or delete a specific file.
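
For JSONL input, each line is a JSON object from which the loaders read a `doc` field (the text) and a `doc_id` field (recorded as the document's source). A hypothetical line:

```json
{"doc_id": "inc_doc_001", "doc": "Document indexing stores parsed files for later retrieval."}
```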

## Get Started

### Sparse Indexing

For sparse indexing, parsing a local file into the desired document store is straightforward: provide the file path with the `--file_path` parameter and choose a local document store with the `--store` parameter.

Note, however, that the `In Memory` store does not persist to a local database; the documents must be re-processed on every run.

```bash
python doc_index.py --file_path "xxx" --output_path "xxx" --embedding_method sparse --store Elasticsearch
```
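
To index into the in-memory store instead (no Elasticsearch service required), the same command with `--store inmemory` should work:

```bash
python doc_index.py --file_path "xxx" --output_path "xxx" --embedding_method sparse --store inmemory
```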

### Dense Indexing

For dense indexing, users can choose their preferred pretrained encoder model. In this use case, we utilize the `instructor-large` model with the `HuggingFaceInstructEmbeddings` API. Users can select a suitable model from the [text embedding benchmark leaderboard](https://huggingface.co/spaces/mteb/leaderboard); both local models and models hosted on the HuggingFace Hub are supported. To use a specific model, pass its name with `--embedding_model`.

Alternatively, users can utilize GPT models from OpenAI. To incorporate a GPT model, make a minor adjustment to the following code:
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
```
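
Note that `OpenAIEmbeddings` reads the API key from the `OPENAI_API_KEY` environment variable, so that variable must be set before indexing.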

The user can start dense indexing with:
```bash
python doc_index.py --file_path "xxx" --output_path "xxx" --embedding_model hkunlp/instructor-large --embedding_method dense --store Chroma
```
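
After indexing completes, the persisted Chroma database can be reloaded for retrieval. A minimal sketch, assuming the default `--output_path` (`db_jsonl122`) and the same embedding model used for indexing:

```python
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

embedding = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
vectordb = Chroma(persist_directory="db_jsonl122", embedding_function=embedding)
print(vectordb.similarity_search("document indexing", k=2))
```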
Lines changed: 4 additions & 0 deletions

langchain
chromadb
PyPDF2
# the Haystack document stores used above ship on PyPI as `farm-haystack`
farm-haystack
# python-docx provides the `docx` module imported by the indexing script
python-docx
