@@ -204,12 +204,21 @@ def _annotate_with_smart_cascade(
204
204
Annotations from the first engine that finds sufficient entities
205
205
"""
206
206
# Stage 1: Try regex first (fastest)
207
- regex_result = self .regex_annotator .annotate (text )
208
- if self ._cascade_should_stop ("regex" , regex_result ):
209
- if structured :
210
- _ , result = self .regex_annotator .annotate_with_spans (text )
207
+ if structured :
208
+ # For structured output, use annotate_with_spans directly to avoid double processing
209
+ _ , result = self .regex_annotator .annotate_with_spans (text )
210
+ regex_result = {}
211
+ for span in result .spans :
212
+ if span .label not in regex_result :
213
+ regex_result [span .label ] = []
214
+ regex_result [span .label ].append (span .text )
215
+
216
+ if self ._cascade_should_stop ("regex" , regex_result ):
211
217
return result .spans
212
- return regex_result
218
+ else :
219
+ regex_result = self .regex_annotator .annotate (text )
220
+ if self ._cascade_should_stop ("regex" , regex_result ):
221
+ return regex_result
213
222
214
223
# Stage 2: Try GLiNER (balanced speed/accuracy)
215
224
if self .gliner_annotator is not None :
@@ -232,6 +241,97 @@ def _annotate_with_smart_cascade(
232
241
return result .spans
233
242
return regex_result
234
243
244
+ def _annotate_single_chunk (
245
+ self , text : str , structured : bool = False
246
+ ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
247
+ """Annotate a single chunk of text based on the engine type."""
248
+ if self .engine == "regex" :
249
+ if structured :
250
+ _ , result = self .regex_annotator .annotate_with_spans (text )
251
+ return result .spans
252
+ return self .regex_annotator .annotate (text )
253
+ elif self .engine == "spacy" :
254
+ if self .spacy_annotator is None :
255
+ raise ImportError (
256
+ "SpaCy engine not available. Install with: pip install datafog[nlp]"
257
+ )
258
+ return self .spacy_annotator .annotate (text )
259
+ elif self .engine == "gliner" :
260
+ if self .gliner_annotator is None :
261
+ raise ImportError (
262
+ "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
263
+ )
264
+ return self .gliner_annotator .annotate (text )
265
+ elif self .engine == "smart" :
266
+ return self ._annotate_with_smart_cascade (text , structured )
267
+ elif self .engine == "auto" :
268
+ return self ._annotate_with_auto_engine (text , structured )
269
+
270
+ def _annotate_with_auto_engine (
271
+ self , text : str , structured : bool = False
272
+ ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
273
+ """Handle auto engine annotation with regex fallback to spacy."""
274
+ # Try regex first
275
+ if structured :
276
+ # For structured output, use annotate_with_spans directly to avoid double processing
277
+ _ , result = self .regex_annotator .annotate_with_spans (text )
278
+ regex_result = {}
279
+ for span in result .spans :
280
+ if span .label not in regex_result :
281
+ regex_result [span .label ] = []
282
+ regex_result [span .label ].append (span .text )
283
+
284
+ # Check if regex found any entities
285
+ if any (entities for entities in regex_result .values ()):
286
+ return result .spans
287
+ else :
288
+ regex_result = self .regex_annotator .annotate (text )
289
+
290
+ # Check if regex found any entities
291
+ if any (entities for entities in regex_result .values ()):
292
+ return regex_result
293
+
294
+ # Fall back to spacy if available
295
+ if self .spacy_annotator is not None :
296
+ return self .spacy_annotator .annotate (text )
297
+
298
+ # Return regex result even if empty
299
+ if structured :
300
+ # We already have the result from above in structured mode
301
+ return result .spans
302
+ return regex_result
303
+
304
+ def _annotate_multiple_chunks_structured (self , chunks : List [str ]) -> List ["Span" ]:
305
+ """Handle structured annotation across multiple chunks."""
306
+ all_spans = []
307
+ current_offset = 0
308
+
309
+ # Get Span class once outside the loop for efficiency
310
+ SpanClass = _get_span_class ()
311
+
312
+ for chunk in chunks :
313
+ chunk_spans = self ._annotate_single_chunk (chunk , structured = True )
314
+ # Adjust span positions to account for chunk offset
315
+ for span in chunk_spans :
316
+ adjusted_span = SpanClass (
317
+ start = span .start + current_offset ,
318
+ end = span .end + current_offset ,
319
+ text = span .text ,
320
+ label = span .label ,
321
+ )
322
+ all_spans .append (adjusted_span )
323
+ current_offset += len (chunk )
324
+
325
+ return all_spans
326
+
327
+ def _annotate_multiple_chunks_dict (self , chunks : List [str ]) -> Dict [str , List [str ]]:
328
+ """Handle dictionary annotation across multiple chunks."""
329
+ chunk_annotations = []
330
+ for chunk in chunks :
331
+ chunk_result = self ._annotate_single_chunk (chunk , structured = False )
332
+ chunk_annotations .append (chunk_result )
333
+ return self ._combine_annotations (chunk_annotations )
334
+
235
335
def annotate_text_sync (
236
336
self , text : str , structured : bool = False
237
337
) -> Union [Dict [str , List [str ]], List ["Span" ]]:
@@ -247,76 +347,15 @@ def annotate_text_sync(
247
347
"""
248
348
if len (text ) <= self .text_chunk_length :
249
349
# Single chunk processing
250
- if self .engine == "regex" :
251
- if structured :
252
- _ , result = self .regex_annotator .annotate_with_spans (text )
253
- return result .spans
254
- return self .regex_annotator .annotate (text )
255
- elif self .engine == "spacy" :
256
- if self .spacy_annotator is None :
257
- raise ImportError (
258
- "SpaCy engine not available. Install with: pip install datafog[nlp]"
259
- )
260
- return self .spacy_annotator .annotate (text )
261
- elif self .engine == "gliner" :
262
- if self .gliner_annotator is None :
263
- raise ImportError (
264
- "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
265
- )
266
- return self .gliner_annotator .annotate (text )
267
- elif self .engine == "smart" :
268
- return self ._annotate_with_smart_cascade (text , structured )
269
- elif self .engine == "auto" :
270
- # Try regex first
271
- regex_result = self .regex_annotator .annotate (text )
272
-
273
- # Check if regex found any entities
274
- if any (entities for entities in regex_result .values ()):
275
- if structured :
276
- _ , result = self .regex_annotator .annotate_with_spans (text )
277
- return result .spans
278
- return regex_result
279
-
280
- # Fall back to spacy if available
281
- if self .spacy_annotator is not None :
282
- return self .spacy_annotator .annotate (text )
283
-
284
- # Return regex result even if empty
285
- if structured :
286
- _ , result = self .regex_annotator .annotate_with_spans (text )
287
- return result .spans
288
- return regex_result
350
+ return self ._annotate_single_chunk (text , structured )
289
351
else :
290
352
# Multi-chunk processing
291
353
chunks = self ._chunk_text (text )
292
354
293
355
if structured :
294
- # For structured output, we need to handle span positions across chunks
295
- all_spans = []
296
- current_offset = 0
297
-
298
- for chunk in chunks :
299
- chunk_spans = self .annotate_text_sync (chunk , structured = True )
300
- # Adjust span positions to account for chunk offset
301
- for span in chunk_spans :
302
- SpanClass = _get_span_class ()
303
- adjusted_span = SpanClass (
304
- start = span .start + current_offset ,
305
- end = span .end + current_offset ,
306
- text = span .text ,
307
- label = span .label ,
308
- )
309
- all_spans .append (adjusted_span )
310
- current_offset += len (chunk )
311
-
312
- return all_spans
356
+ return self ._annotate_multiple_chunks_structured (chunks )
313
357
else :
314
- # Dictionary format - combine annotations
315
- chunk_annotations = []
316
- for chunk in chunks :
317
- chunk_result = self .annotate_text_sync (chunk , structured = False )
318
- chunk_annotations .append (chunk_result )
319
- return self ._combine_annotations (chunk_annotations )
358
+ return self ._annotate_multiple_chunks_dict (chunks )
320
359
321
360
async def annotate_text_async (
322
361
self , text : str , structured : bool = False
0 commit comments