Skip to content

Commit d07f56d

Browse files
author
北词你好
committed
update
1 parent ba4f014 commit d07f56d

File tree

2 files changed

+104
-340
lines changed

2 files changed

+104
-340
lines changed

dataload/dataloader.py

Lines changed: 15 additions & 262 deletions
Original file line numberDiff line numberDiff line change
@@ -24,49 +24,43 @@ def __init__(self, dataset=None, dir_path=None):
2424
self.target_folder = "npy_data"
2525

2626
def download_and_cleanup(self, repo_url, dataset_name, local_target_folder):
27-
# 创建临时目录
28-
local_repo_path = os.path.join(os.path.dirname(__file__), "tmp", str(uuid4()))
2927
try:
3028
print(f"Downloading {dataset_name}.npy from {repo_url}")
3129

32-
# 克隆仓库
33-
subprocess.run(['git', 'clone', '--branch', 'main', repo_url, local_repo_path], check=True)
30+
# 直接将仓库克隆到目标文件夹
31+
subprocess.run(['git', 'clone', '--branch', 'main', repo_url, local_target_folder], check=True)
3432

3533
# 确保目标目录存在
3634
os.makedirs(local_target_folder, exist_ok=True)
37-
print(f"Target directory: {local_target_folder}") # 调试信息
35+
print(f"Target directory: {local_target_folder}")
3836

39-
# 搜索.npy文件
37+
# 搜索 .npy 文件
4038
npy_files = []
41-
for root, dirs, files in os.walk(local_repo_path):
39+
for root, dirs, files in os.walk(local_target_folder):
4240
for file in files:
4341
if file == f'{dataset_name}.npy':
4442
npy_files.append(os.path.join(root, file))
4543

4644
if npy_files:
47-
target_file = os.path.join(local_target_folder, f'{dataset_name}.npy')
48-
print(f"Copying from {npy_files[0]} to {target_file}") # 调试信息
49-
shutil.copy2(npy_files[0], target_file)
50-
return True
45+
target_file = npy_files[0] # 使用下载后的文件
46+
print(f"Using downloaded file: {target_file}")
47+
return target_file
5148
else:
5249
print(f"Error: {dataset_name}.npy not found in repository")
53-
return False
54-
50+
return None
51+
5552
except Exception as e:
5653
print(f"Error during download: {str(e)}")
57-
return False
58-
finally:
59-
if os.path.exists(local_repo_path):
60-
shutil.rmtree(local_repo_path)
54+
return None
6155

6256
def download(self):
6357
local_target_folder = os.path.join(self.default_root_path, self.dataset)
64-
success = self.download_and_cleanup(self.repo_url, self.dataset, local_target_folder)
65-
if not success:
58+
file_path = self.download_and_cleanup(self.repo_url, self.dataset, local_target_folder)
59+
if not file_path:
6660
raise RuntimeError(f"Failed to download {self.dataset} dataset.")
67-
return True
61+
return file_path
6862

69-
def load_dataset(self):
63+
def load_data(self):
7064
"""Temporary implementation that returns empty dataset"""
7165
print(f"Loading {self.dataset} dataset (mock data)")
7266
return {
@@ -114,244 +108,3 @@ def get_dataset_name(self):
114108
str: The name of the dataset.
115109
"""
116110
return self.dataset
117-
118-
class MAVEN(DatasetLoader):
119-
def __init__(self, dir_path=None):
120-
super().__init__(dataset='MAVEN', dir_path=dir_path)
121-
122-
def load_data(self):
123-
dataset_path = os.path.join(self.default_root_path, self.dataset)
124-
print(f"Dataset path: {dataset_path}") # 调试信息
125-
126-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
127-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
128-
if not self.download():
129-
raise RuntimeError("Failed to download dataset")
130-
131-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
132-
print(f"Loading file from: {file_path}") # 调试信息
133-
134-
if not os.path.exists(file_path):
135-
print(f"File not found at: {file_path}")
136-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
137-
raise FileNotFoundError(f"Data file not found at {file_path}")
138-
139-
data = np.load(file_path, allow_pickle=True)
140-
df = pd.DataFrame(data, columns=self.required_columns)
141-
print("MAVEN dataset loaded successfully.")
142-
return df
143-
144-
class CrisisNLP(DatasetLoader):
145-
def __init__(self, dir_path=None):
146-
super().__init__(dataset='CrisisNLP', dir_path=dir_path)
147-
148-
def load_data(self):
149-
dataset_path = os.path.join(self.default_root_path, self.dataset)
150-
print(f"Dataset path: {dataset_path}") # 调试信息
151-
152-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
153-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
154-
if not self.download():
155-
raise RuntimeError("Failed to download dataset")
156-
157-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
158-
print(f"Loading file from: {file_path}") # 调试信息
159-
160-
if not os.path.exists(file_path):
161-
print(f"File not found at: {file_path}")
162-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
163-
raise FileNotFoundError(f"Data file not found at {file_path}")
164-
165-
data = np.load(file_path, allow_pickle=True)
166-
df = pd.DataFrame(data, columns=self.required_columns)
167-
print("CrisisNLP dataset loaded successfully.")
168-
return df
169-
170-
class Event2012(DatasetLoader):
171-
def __init__(self, dir_path=None):
172-
super().__init__(dataset='Event2012', dir_path=dir_path)
173-
174-
def load_data(self):
175-
dataset_path = os.path.join(self.default_root_path, self.dataset)
176-
print(f"Dataset path: {dataset_path}") # 调试信息
177-
178-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
179-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
180-
if not self.download():
181-
raise RuntimeError("Failed to download dataset")
182-
183-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
184-
print(f"Loading file from: {file_path}") # 调试信息
185-
186-
if not os.path.exists(file_path):
187-
print(f"File not found at: {file_path}")
188-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
189-
raise FileNotFoundError(f"Data file not found at {file_path}")
190-
191-
data = np.load(file_path, allow_pickle=True)
192-
df = pd.DataFrame(data, columns=self.required_columns)
193-
print("Event2012 dataset loaded successfully.")
194-
return df
195-
196-
class Event2018(DatasetLoader):
197-
def __init__(self, dir_path=None):
198-
super().__init__(dataset='Event2018', dir_path=dir_path)
199-
200-
def load_data(self):
201-
dataset_path = os.path.join(self.default_root_path, self.dataset)
202-
print(f"Dataset path: {dataset_path}") # 调试信息
203-
204-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
205-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
206-
if not self.download():
207-
raise RuntimeError("Failed to download dataset")
208-
209-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
210-
print(f"Loading file from: {file_path}") # 调试信息
211-
212-
if not os.path.exists(file_path):
213-
print(f"File not found at: {file_path}")
214-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
215-
raise FileNotFoundError(f"Data file not found at {file_path}")
216-
217-
data = np.load(file_path, allow_pickle=True)
218-
df = pd.DataFrame(data, columns=self.required_columns)
219-
print("Event2018 dataset loaded successfully.")
220-
return df
221-
222-
class ArabicTwitter(DatasetLoader):
223-
def __init__(self, dir_path=None):
224-
super().__init__(dataset='Arabic_Twitter', dir_path=dir_path)
225-
226-
def load_data(self):
227-
dataset_path = os.path.join(self.default_root_path, self.dataset)
228-
print(f"Dataset path: {dataset_path}") # 调试信息
229-
230-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
231-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
232-
if not self.download():
233-
raise RuntimeError("Failed to download dataset")
234-
235-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
236-
print(f"Loading file from: {file_path}") # 调试信息
237-
238-
if not os.path.exists(file_path):
239-
print(f"File not found at: {file_path}")
240-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
241-
raise FileNotFoundError(f"Data file not found at {file_path}")
242-
243-
data = np.load(file_path, allow_pickle=True)
244-
df = pd.DataFrame(data, columns=self.required_columns)
245-
print("Arabic Twitter dataset loaded successfully.")
246-
return df
247-
248-
class CrisisLexT26(DatasetLoader):
249-
def __init__(self, dir_path=None):
250-
super().__init__(dataset='CrisisLexT26', dir_path=dir_path)
251-
252-
def load_data(self):
253-
dataset_path = os.path.join(self.default_root_path, self.dataset)
254-
print(f"Dataset path: {dataset_path}") # 调试信息
255-
256-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
257-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
258-
if not self.download():
259-
raise RuntimeError("Failed to download dataset")
260-
261-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
262-
print(f"Loading file from: {file_path}") # 调试信息
263-
264-
if not os.path.exists(file_path):
265-
print(f"File not found at: {file_path}")
266-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
267-
raise FileNotFoundError(f"Data file not found at {file_path}")
268-
269-
data = np.load(file_path, allow_pickle=True)
270-
df = pd.DataFrame(data, columns=self.required_columns)
271-
print("CrisisLexT26 dataset loaded successfully.")
272-
return df
273-
274-
class CrisisMMD(DatasetLoader):
275-
def __init__(self, dir_path=None):
276-
super().__init__(dataset='CrisisMMD', dir_path=dir_path)
277-
278-
def load_data(self):
279-
dataset_path = os.path.join(self.default_root_path, self.dataset)
280-
print(f"Dataset path: {dataset_path}") # 调试信息
281-
282-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
283-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
284-
if not self.download():
285-
raise RuntimeError("Failed to download dataset")
286-
287-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
288-
print(f"Loading file from: {file_path}") # 调试信息
289-
290-
if not os.path.exists(file_path):
291-
print(f"File not found at: {file_path}")
292-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
293-
raise FileNotFoundError(f"Data file not found at {file_path}")
294-
295-
data = np.load(file_path, allow_pickle=True)
296-
df = pd.DataFrame(data, columns=self.required_columns)
297-
print("CrisisMMD dataset loaded successfully.")
298-
return df
299-
300-
class HumAID(DatasetLoader):
301-
def __init__(self, dir_path=None):
302-
super().__init__(dataset='HumAID', dir_path=dir_path)
303-
304-
def load_data(self):
305-
dataset_path = os.path.join(self.default_root_path, self.dataset)
306-
print(f"Dataset path: {dataset_path}") # 调试信息
307-
308-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
309-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
310-
if not self.download():
311-
raise RuntimeError("Failed to download dataset")
312-
313-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
314-
print(f"Loading file from: {file_path}") # 调试信息
315-
316-
if not os.path.exists(file_path):
317-
print(f"File not found at: {file_path}")
318-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
319-
raise FileNotFoundError(f"Data file not found at {file_path}")
320-
321-
data = np.load(file_path, allow_pickle=True)
322-
df = pd.DataFrame(data, columns=self.required_columns)
323-
print("HumAID dataset loaded successfully.")
324-
return df
325-
326-
class KBP(DatasetLoader):
327-
def __init__(self, dir_path=None):
328-
super().__init__(dataset='KBP', dir_path=dir_path)
329-
330-
def load_data(self):
331-
dataset_path = os.path.join(self.default_root_path, self.dataset)
332-
print(f"Dataset path: {dataset_path}") # 调试信息
333-
334-
if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
335-
print(f"Directory {dataset_path} does not exist or is empty, downloading...")
336-
if not self.download():
337-
raise RuntimeError("Failed to download dataset")
338-
339-
file_path = os.path.join(dataset_path, f'{self.dataset}.npy')
340-
print(f"Loading file from: {file_path}") # 调试信息
341-
342-
if not os.path.exists(file_path):
343-
print(f"File not found at: {file_path}")
344-
print(f"Directory contents: {os.listdir(dataset_path) if os.path.exists(dataset_path) else 'Directory does not exist'}")
345-
raise FileNotFoundError(f"Data file not found at {file_path}")
346-
347-
data = np.load(file_path, allow_pickle=True)
348-
df = pd.DataFrame(data, columns=self.required_columns)
349-
print("KBP dataset loaded successfully.")
350-
return df
351-
352-
if __name__ == "__main__":
353-
# Test MAVEN dataset
354-
#maven = MAVEN()
355-
#dataset = MAVEN().load_data()
356-
print(Event2018().get_dataset_name())
357-
print(Event2018().get_dataset_language())

0 commit comments

Comments
 (0)