@@ -147,7 +147,7 @@ def process_byte_based_chunk(
147147 return actual , b"" , total_read , False
148148 return None , prev_overlap , total_read - len (chunk ), False
149149 current = actual [:chunk_size ]
150- next_overlap = actual [chunk_size - overlap : chunk_size ]
150+ next_overlap = actual [chunk_size - overlap : chunk_size ]
151151 return current , next_overlap , total_read , False
152152
153153
@@ -214,17 +214,14 @@ def split_file(
214214 total = file_size + (num_chunks - 1 ) * overlap
215215 chunk_size = total // num_chunks
216216
217- with open (input_file , "rb" , encoding = "utf-8" ) as file :
218- part_num = 1
219- prev_overlap = b""
220- prev_lines : List [bytes ] = []
221- total_read = 0
222-
223- while True :
224- is_first = part_num == 1
217+ if num_lines :
218+ # For line-based chunking, use text mode with UTF-8 encoding
219+ with open (input_file , "r" , encoding = "utf-8" ) as file :
220+ part_num = 1
221+ prev_lines : List [bytes ] = []
225222
226- if num_lines :
227- # Line-based chunking
223+ while True :
224+ is_first = part_num == 1
228225 lines , prev_lines , should_stop = process_line_based_chunk (
229226 file , num_lines , overlap , is_first , prev_lines
230227 )
@@ -233,8 +230,16 @@ def split_file(
233230 write_chunk (lines , input_file , part_num , is_lines = True )
234231 if len (lines ) < (num_lines - (0 if is_first else overlap )):
235232 break
236- else :
237- # Byte-based chunking
233+ part_num += 1
234+ else :
235+ # For byte-based chunking, use binary mode without encoding
236+ with open (input_file , "rb" ) as file :
237+ part_num = 1
238+ prev_overlap = b""
239+ total_read = 0
240+
241+ while True :
242+ is_first = part_num == 1
238243 assert chunk_size is not None # for type checker
239244 result = process_byte_based_chunk (
240245 file ,
@@ -253,10 +258,9 @@ def split_file(
253258 write_chunk (current , input_file , part_num )
254259 if total_read >= file_size and not prev_overlap :
255260 break
256-
257- part_num += 1
258- if num_chunks and part_num > num_chunks :
259- break
261+ part_num += 1
262+ if num_chunks and part_num > num_chunks :
263+ break
260264
261265 # Print summary
262266 print (f"Total chunks created: { part_num - 1 } " )
0 commit comments