@@ -24,49 +24,43 @@ def __init__(self, dataset=None, dir_path=None):
24
24
self .target_folder = "npy_data"
25
25
26
26
def download_and_cleanup(self, repo_url, dataset_name, local_target_folder):
    """Clone ``repo_url`` into ``local_target_folder`` and locate ``<dataset_name>.npy``.

    The ``main`` branch is cloned directly into the target folder, which is
    then searched recursively for the data file. If the folder already holds
    the file (for example from an earlier run) that copy is returned without
    cloning again, because ``git clone`` refuses to write into a non-empty
    directory.

    Args:
        repo_url: URL (or local path) of the git repository to clone.
        dataset_name: Base name of the wanted file, without the ``.npy`` suffix.
        local_target_folder: Directory that receives the clone.

    Returns:
        The path of the first matching ``.npy`` file in walk order, or
        ``None`` when the clone fails or the file is not in the repository.
    """
    wanted = f'{dataset_name}.npy'

    def find_npy():
        # os.walk yields nothing for a missing directory, so this is safe
        # to call before the clone has happened.
        for root, _dirs, files in os.walk(local_target_folder):
            if wanted in files:
                return os.path.join(root, wanted)
        return None

    try:
        print(f"Downloading {dataset_name}.npy from {repo_url}")

        target_file = find_npy()
        if target_file is None:
            # Clone the repository straight into the target folder.
            subprocess.run(
                ['git', 'clone', '--branch', 'main', repo_url, local_target_folder],
                check=True,
            )
            print(f"Target directory: {local_target_folder}")
            target_file = find_npy()

        if target_file is None:
            print(f"Error: {dataset_name}.npy not found in repository")
            return None

        print(f"Using downloaded file: {target_file}")
        return target_file

    except Exception as e:
        # Deliberately broad: callers rely on a None return (not an
        # exception) to detect any kind of download failure.
        print(f"Error during download: {str(e)}")
        return None
61
55
62
56
def download(self):
    """Fetch this loader's dataset and return the path of its ``.npy`` file.

    The clone destination is ``<default_root_path>/<dataset>``.

    Returns:
        The file path reported by ``download_and_cleanup``.

    Raises:
        RuntimeError: if ``download_and_cleanup`` returns a falsy value.
    """
    destination = os.path.join(self.default_root_path, self.dataset)
    npy_path = self.download_and_cleanup(self.repo_url, self.dataset, destination)
    if npy_path:
        return npy_path
    raise RuntimeError(f"Failed to download {self.dataset} dataset.")
68
62
69
- def load_dataset (self ):
63
+ def load_data (self ):
70
64
"""Temporary implementation that returns empty dataset"""
71
65
print (f"Loading { self .dataset } dataset (mock data)" )
72
66
return {
@@ -114,244 +108,3 @@ def get_dataset_name(self):
114
108
str: The name of the dataset.
115
109
"""
116
110
return self .dataset
117
-
118
class MAVEN(DatasetLoader):
    """Loader for the MAVEN dataset, stored as ``MAVEN.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='MAVEN', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("MAVEN dataset loaded successfully.")
        return df
143
-
144
class CrisisNLP(DatasetLoader):
    """Loader for the CrisisNLP dataset, stored as ``CrisisNLP.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='CrisisNLP', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("CrisisNLP dataset loaded successfully.")
        return df
169
-
170
class Event2012(DatasetLoader):
    """Loader for the Event2012 dataset, stored as ``Event2012.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='Event2012', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("Event2012 dataset loaded successfully.")
        return df
195
-
196
class Event2018(DatasetLoader):
    """Loader for the Event2018 dataset, stored as ``Event2018.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='Event2018', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("Event2018 dataset loaded successfully.")
        return df
221
-
222
class ArabicTwitter(DatasetLoader):
    """Loader for the Arabic_Twitter dataset, stored as ``Arabic_Twitter.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='Arabic_Twitter', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("Arabic Twitter dataset loaded successfully.")
        return df
247
-
248
class CrisisLexT26(DatasetLoader):
    """Loader for the CrisisLexT26 dataset, stored as ``CrisisLexT26.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='CrisisLexT26', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("CrisisLexT26 dataset loaded successfully.")
        return df
273
-
274
class CrisisMMD(DatasetLoader):
    """Loader for the CrisisMMD dataset, stored as ``CrisisMMD.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='CrisisMMD', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("CrisisMMD dataset loaded successfully.")
        return df
299
-
300
class HumAID(DatasetLoader):
    """Loader for the HumAID dataset, stored as ``HumAID.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='HumAID', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("HumAID dataset loaded successfully.")
        return df
325
-
326
class KBP(DatasetLoader):
    """Loader for the KBP dataset, stored as ``KBP.npy``."""

    def __init__(self, dir_path=None):
        super().__init__(dataset='KBP', dir_path=dir_path)

    def load_data(self):
        """Return the dataset as a DataFrame, downloading it if the file is absent.

        Returns:
            pandas.DataFrame built from the ``.npy`` contents with columns
            ``self.required_columns``.

        Raises:
            RuntimeError: propagated from ``self.download()`` on failure.
            FileNotFoundError: if the data file is still missing afterwards.
        """
        dataset_path = os.path.join(self.default_root_path, self.dataset)
        print(f"Dataset path: {dataset_path}")  # debug output

        file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
        # Decide on the data file itself rather than on directory emptiness:
        # a directory holding only unrelated files still needs a download.
        if not os.path.exists(file_path):
            print(f"{file_path} not found, downloading...")
            downloaded = self.download()  # raises RuntimeError on failure
            # download() may hand back the located path instead of a bare True.
            if isinstance(downloaded, str):
                file_path = downloaded

        print(f"Loading file from: {file_path}")  # debug output

        if not os.path.exists(file_path):
            print(f"File not found at: {file_path}")
            print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe when the downloaded file comes from a trusted repository.
        data = np.load(file_path, allow_pickle=True)
        df = pd.DataFrame(data, columns=self.required_columns)
        print("KBP dataset loaded successfully.")
        return df
351
-
352
if __name__ == "__main__":
    # Manual smoke check: print what one loader reports for its name and
    # language. The loader is built once and reused for both calls.
    loader = Event2018()
    print(loader.get_dataset_name())
    print(loader.get_dataset_language())
0 commit comments