22
22
@dataclass
23
23
class ConversionStatus :
24
24
"""Track the status of a PDF to Markdown conversion."""
25
+
25
26
paper_id : str
26
27
status : str # 'downloading', 'converting', 'success', 'error'
27
28
started_at : datetime
@@ -37,16 +38,16 @@ class ConversionStatus:
37
38
"properties" : {
38
39
"paper_id" : {
39
40
"type" : "string" ,
40
- "description" : "The arXiv ID of the paper to download"
41
+ "description" : "The arXiv ID of the paper to download" ,
41
42
},
42
43
"check_status" : {
43
44
"type" : "boolean" ,
44
45
"description" : "If true, only check conversion status without downloading" ,
45
- "default" : False
46
- }
46
+ "default" : False ,
47
+ },
47
48
},
48
- "required" : ["paper_id" ]
49
- }
49
+ "required" : ["paper_id" ],
50
+ },
50
51
)
51
52
52
53
@@ -63,19 +64,19 @@ def convert_pdf_to_markdown(paper_id: str, pdf_path: Path) -> None:
63
64
logger .info (f"Starting conversion for { paper_id } " )
64
65
markdown = pymupdf4llm .to_markdown (pdf_path , show_progress = False )
65
66
md_path = get_paper_path (paper_id , ".md" )
66
-
67
+
67
68
with open (md_path , "w" , encoding = "utf-8" ) as f :
68
69
f .write (markdown )
69
70
70
71
status = conversion_statuses .get (paper_id )
71
72
if status :
72
73
status .status = "success"
73
74
status .completed_at = datetime .now ()
74
-
75
+
75
76
# Clean up PDF after successful conversion
76
77
pdf_path .unlink ()
77
78
logger .info (f"Conversion completed for { paper_id } " )
78
-
79
+
79
80
except Exception as e :
80
81
logger .error (f"Conversion failed for { paper_id } : { str (e )} " )
81
82
status = conversion_statuses .get (paper_id )
@@ -90,108 +91,137 @@ async def handle_download(arguments: Dict[str, Any]) -> List[types.TextContent]:
90
91
try :
91
92
paper_id = arguments ["paper_id" ]
92
93
check_status = arguments .get ("check_status" , False )
93
-
94
+
94
95
# If only checking status
95
96
if check_status :
96
97
status = conversion_statuses .get (paper_id )
97
98
if not status :
98
99
if get_paper_path (paper_id , ".md" ).exists ():
99
- return [types .TextContent (
100
+ return [
101
+ types .TextContent (
102
+ type = "text" ,
103
+ text = json .dumps (
104
+ {
105
+ "status" : "success" ,
106
+ "message" : "Paper is ready" ,
107
+ "resource_uri" : f"file://{ get_paper_path (paper_id , '.md' )} " ,
108
+ }
109
+ ),
110
+ )
111
+ ]
112
+ return [
113
+ types .TextContent (
100
114
type = "text" ,
101
- text = json .dumps ({
102
- "status" : "success" ,
103
- "message" : "Paper is ready" ,
104
- "resource_uri" : f"file://{ get_paper_path (paper_id , '.md' )} "
105
- })
106
- )]
107
- return [types .TextContent (
115
+ text = json .dumps (
116
+ {
117
+ "status" : "unknown" ,
118
+ "message" : "No download or conversion in progress" ,
119
+ }
120
+ ),
121
+ )
122
+ ]
123
+
124
+ return [
125
+ types .TextContent (
108
126
type = "text" ,
109
- text = json .dumps ({
110
- "status" : "unknown" ,
111
- "message" : "No download or conversion in progress"
112
- })
113
- )]
114
-
115
- return [types .TextContent (
116
- type = "text" ,
117
- text = json .dumps ({
118
- "status" : status .status ,
119
- "started_at" : status .started_at .isoformat (),
120
- "completed_at" : status .completed_at .isoformat () if status .completed_at else None ,
121
- "error" : status .error ,
122
- "message" : f"Paper conversion { status .status } "
123
- })
124
- )]
125
-
127
+ text = json .dumps (
128
+ {
129
+ "status" : status .status ,
130
+ "started_at" : status .started_at .isoformat (),
131
+ "completed_at" : (
132
+ status .completed_at .isoformat ()
133
+ if status .completed_at
134
+ else None
135
+ ),
136
+ "error" : status .error ,
137
+ "message" : f"Paper conversion { status .status } " ,
138
+ }
139
+ ),
140
+ )
141
+ ]
142
+
126
143
# Check if paper is already converted
127
144
if get_paper_path (paper_id , ".md" ).exists ():
128
- return [types .TextContent (
129
- type = "text" ,
130
- text = json .dumps ({
131
- "status" : "success" ,
132
- "message" : "Paper already available" ,
133
- "resource_uri" : f"file://{ get_paper_path (paper_id , '.md' )} "
134
- })
135
- )]
136
-
145
+ return [
146
+ types .TextContent (
147
+ type = "text" ,
148
+ text = json .dumps (
149
+ {
150
+ "status" : "success" ,
151
+ "message" : "Paper already available" ,
152
+ "resource_uri" : f"file://{ get_paper_path (paper_id , '.md' )} " ,
153
+ }
154
+ ),
155
+ )
156
+ ]
157
+
137
158
# Check if already in progress
138
159
if paper_id in conversion_statuses :
139
160
status = conversion_statuses [paper_id ]
140
- return [types .TextContent (
141
- type = "text" ,
142
- text = json .dumps ({
143
- "status" : status .status ,
144
- "message" : f"Paper conversion { status .status } " ,
145
- "started_at" : status .started_at .isoformat ()
146
- })
147
- )]
148
-
161
+ return [
162
+ types .TextContent (
163
+ type = "text" ,
164
+ text = json .dumps (
165
+ {
166
+ "status" : status .status ,
167
+ "message" : f"Paper conversion { status .status } " ,
168
+ "started_at" : status .started_at .isoformat (),
169
+ }
170
+ ),
171
+ )
172
+ ]
173
+
149
174
# Start new download and conversion
150
175
pdf_path = get_paper_path (paper_id , ".pdf" )
151
176
client = arxiv .Client ()
152
-
177
+
153
178
# Initialize status
154
179
conversion_statuses [paper_id ] = ConversionStatus (
155
- paper_id = paper_id ,
156
- status = "downloading" ,
157
- started_at = datetime .now ()
180
+ paper_id = paper_id , status = "downloading" , started_at = datetime .now ()
158
181
)
159
-
182
+
160
183
# Download PDF
161
184
paper = next (client .results (arxiv .Search (id_list = [paper_id ])))
162
185
paper .download_pdf (dirpath = pdf_path .parent , filename = pdf_path .name )
163
-
186
+
164
187
# Update status and start conversion
165
188
status = conversion_statuses [paper_id ]
166
189
status .status = "converting"
167
-
190
+
168
191
# Start conversion in thread
169
192
asyncio .create_task (
170
193
asyncio .to_thread (convert_pdf_to_markdown , paper_id , pdf_path )
171
194
)
172
-
173
- return [types .TextContent (
174
- type = "text" ,
175
- text = json .dumps ({
176
- "status" : "converting" ,
177
- "message" : "Paper downloaded, conversion started" ,
178
- "started_at" : status .started_at .isoformat ()
179
- })
180
- )]
181
-
195
+
196
+ return [
197
+ types .TextContent (
198
+ type = "text" ,
199
+ text = json .dumps (
200
+ {
201
+ "status" : "converting" ,
202
+ "message" : "Paper downloaded, conversion started" ,
203
+ "started_at" : status .started_at .isoformat (),
204
+ }
205
+ ),
206
+ )
207
+ ]
208
+
182
209
except StopIteration :
183
- return [types .TextContent (
184
- type = "text" ,
185
- text = json .dumps ({
186
- "status" : "error" ,
187
- "message" : f"Paper { paper_id } not found on arXiv"
188
- })
189
- )]
210
+ return [
211
+ types .TextContent (
212
+ type = "text" ,
213
+ text = json .dumps (
214
+ {
215
+ "status" : "error" ,
216
+ "message" : f"Paper { paper_id } not found on arXiv" ,
217
+ }
218
+ ),
219
+ )
220
+ ]
190
221
except Exception as e :
191
- return [types .TextContent (
192
- type = "text" ,
193
- text = json .dumps ({
194
- "status" : "error" ,
195
- "message" : f"Error: { str (e )} "
196
- })
197
- )]
222
+ return [
223
+ types .TextContent (
224
+ type = "text" ,
225
+ text = json .dumps ({"status" : "error" , "message" : f"Error: { str (e )} " }),
226
+ )
227
+ ]
0 commit comments