9
9
from fastapi .exceptions import HTTPException
10
10
from fastapi .middleware .cors import CORSMiddleware
11
11
12
- from services import identify_file_type , merge_pdfs , save_file , extract_pdf_text_searchable , get_file_size , extract_image_text
12
+ from services import identify_file_type , merge_pdfs , save_file
13
13
from service_wrappers import extract_image_text_and_set_db , extract_pdf_text_and_set_db
14
14
from textract_wrapper import detect_text_and_set_db
15
- from text_analysis import analyze
16
15
from tasks import enqueue_extraction
17
16
from db import set_object , get_object
18
17
@@ -38,18 +37,6 @@ def root():
38
37
return "Document Processing"
39
38
40
39
41
- @app .post ("/content-type" )
42
- def identify_content_type (attachment : UploadFile ):
43
- # Identify the file mime type.
44
- filename = f"/media/content-type-identification/{ attachment .filename } "
45
- save_file (attachment .file , filename )
46
- # We read through the file in the previous step, i.e. save_file().
47
- # We must seek(0) to return to the beginning before trying to identify the file type.
48
- attachment .file .seek (0 )
49
- file_type = identify_file_type (attachment .file )
50
- return {"content-type" : file_type .mime_type }
51
-
52
-
53
40
@app .post ("/pdfs-merge" )
54
41
def pdfs_merge (attachments : List [UploadFile ]):
55
42
"""
@@ -75,48 +62,6 @@ def pdfs_merge(attachments: List[UploadFile]):
75
62
return {"status" : "processed" , "filename" : merged_filename }
76
63
77
64
78
- @app .post ("/extract-pdf-text" )
79
- def extract_text (attachment : UploadFile ):
80
- """
81
- Extracts text from an attachment uploaded through multipart/form-data.
82
- """
83
- type_details = identify_file_type (attachment .file )
84
- if type_details .mime_type != 'application/pdf' :
85
- raise HTTPException (status_code = 400 , detail = "A non-pdf file found." )
86
- attachment_name = attachment .filename
87
- output_filename = f"/media/extraction-pdfs/{ attachment_name } "
88
- save_file (attachment .file , output_filename )
89
- attachment .file .seek (0 )
90
- is_success , content = extract_pdf_text_searchable (attachment .file )
91
- if is_success is False :
92
- raise HTTPException (status_code = 400 , detail = content )
93
- analysis_result = analyze (content )
94
- return {"content" : content , "analysis_result" : analysis_result }
95
-
96
-
97
- @app .post ("/extract-image-text" )
98
- def extract_img_text (attachment : UploadFile ):
99
- """
100
- Perform OCR on the uploaded attachment.
101
- Currently works with images having text.
102
- Later add support for PDFs and Docx as well.
103
- """
104
- type_details = identify_file_type (attachment .file )
105
- if not type_details .mime_type .startswith ('image' ):
106
- raise HTTPException (status_code = 400 , detail = "A non image file found." )
107
- file_size = get_file_size (attachment .file )
108
- # 10 MB
109
- if file_size > (10 * 1024 * 1024 ):
110
- raise HTTPException (status_code = 400 , detail = "Only supports upto 10MB files." )
111
- output_filename = f"/media/extraction-images/{ attachment .filename } "
112
- attachment .file .seek (0 )
113
- save_file (attachment .file , output_filename )
114
- is_success , content = extract_image_text (output_filename )
115
- if is_success is False :
116
- raise HTTPException (status_code = 400 , detail = content )
117
- return {"content" : content }
118
-
119
-
120
65
@app .post ("/ocr" )
121
66
def ocr (attachment : UploadFile , gray : bool = Form (True ), denoise : bool = Form (True ), binarize : bool = Form (True )):
122
67
"""
0 commit comments