Skip to content

Commit e576957

Browse files
committed
Added script to link publications in europepmc
1 parent 7805355 commit e576957

File tree

1 file changed

+145
-0
lines changed

1 file changed

+145
-0
lines changed

generate_eupmc_links.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import csv
2+
import logging
3+
import xml.etree.ElementTree as ET
4+
from xml.dom import minidom
5+
from typing import List, Dict
6+
import argparse
7+
import configparser
8+
import os
9+
from ftplib import FTP
10+
from pathlib import Path
11+
import os
12+
13+
14+
def setup_logging():
    """Configure root logging for the script.

    Sets the root logger level to INFO and attaches a single
    ``logging.StreamHandler`` whose records are rendered as
    ``<timestamp> [<LEVEL>] <message>``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.StreamHandler()],
    )
20+
21+
22+
def read_tsv(file_path: str) -> List[Dict[str, str]]:
    """Read a tab-separated file and return its rows as dictionaries.

    The first line of the file is used as the header; every following line
    becomes one dictionary keyed by those header names.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        One dictionary per data row, in file order.
    """
    logging.info(f"Reading TSV file: {file_path}")
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise become part of the first column name.
    with open(file_path, newline='', encoding='utf-8-sig') as f:
        return list(csv.DictReader(f, delimiter='\t'))
28+
29+
30+
def build_xml(data: List[Dict[str, str]]) -> ET.Element:
    """Build the ``<links>`` XML tree from the TSV rows.

    Each usable row yields one ``<link providerId="2057">`` element holding a
    ``<resource>`` (title and EMDB URL built from ``EMDB_ID``) and a
    ``<record>`` (source ``MED`` and the row's ``PUBMED_ID``).

    Rows whose ``EMDB_ID`` or ``PUBMED_ID`` is empty, whitespace-only or
    ``None`` are skipped so that no link with an empty title or id is emitted.

    Args:
        data: Rows keyed by column name; each row must contain the
            ``EMDB_ID`` and ``PUBMED_ID`` keys.

    Returns:
        The root ``<links>`` element.

    Raises:
        KeyError: If a row lacks the ``EMDB_ID`` or ``PUBMED_ID`` column.
    """
    logging.info("Building XML structure")
    root = ET.Element("links")
    skipped = 0
    for row in data:
        # Index (rather than .get) so a missing column still fails loudly;
        # only empty *values* are tolerated and skipped.
        emdb_id = (row["EMDB_ID"] or "").strip()
        pubmed_id = (row["PUBMED_ID"] or "").strip()
        if not emdb_id or not pubmed_id:
            skipped += 1
            continue

        link = ET.SubElement(root, "link", providerId="2057")

        resource = ET.SubElement(link, "resource")
        ET.SubElement(resource, "title").text = emdb_id
        ET.SubElement(resource, "url").text = f"https://www.ebi.ac.uk/emdb/{emdb_id}"

        record = ET.SubElement(link, "record")
        ET.SubElement(record, "source").text = "MED"
        ET.SubElement(record, "id").text = pubmed_id

    if skipped:
        logging.warning(f"Skipped {skipped} row(s) with a missing EMDB_ID or PUBMED_ID")
    return root
49+
50+
51+
def prettify_xml(elem: ET.Element) -> str:
    """Return a pretty-printed XML string for the Element.

    The element is serialised to UTF-8 bytes, re-parsed with ``minidom`` and
    rendered with one space of indentation per nesting level. The returned
    text starts with the XML declaration that ``minidom`` emits.

    Args:
        elem: Root element of the tree to render.

    Returns:
        The indented XML document as a string.
    """
    raw_bytes = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(raw_bytes)
    return document.toprettyxml(indent=" ")
54+
55+
56+
def write_xml(xml_root: ET.Element, output_file: str) -> None:
    """Write the XML tree to a file with pretty formatting.

    The tree is rendered through ``prettify_xml`` and written as UTF-8 text;
    an existing file at ``output_file`` is overwritten.

    Args:
        xml_root: Root element of the tree to serialise.
        output_file: Destination path of the XML file.
    """
    logging.info(f"Writing XML to file: {output_file}")
    pretty_xml = prettify_xml(xml_root)
    with open(output_file, "w", encoding='utf-8') as f:
        f.write(pretty_xml)
62+
63+
64+
def upload_file_via_ftp(
    server: str,
    username: str,
    password: str,
    local_file_path: str,
    remote_dir: str = ".",
    remote_filename: str = None
) -> None:
    """
    Upload a file to an FTP server. It uses port 21 by default.

    Failures are logged rather than raised: if the local path is not an
    existing regular file, or if anything goes wrong while connecting,
    logging in, changing directory or transferring, an error is logged and
    the function returns without raising.

    Args:
        server (str): FTP server address.
        username (str): FTP username.
        password (str): FTP password.
        local_file_path (str): Path to the local file to upload.
        remote_dir (str): Remote directory to upload the file to. Defaults to
            ".", i.e. whatever directory the session is in after login.
        remote_filename (str): Optional name to give the uploaded file. Defaults to same as local file.
    """
    # isfile (not exists): a directory path must be rejected here, before any
    # connection is opened, instead of failing later inside open().
    if not os.path.isfile(local_file_path):
        logging.error(f"Local file does not exist: {local_file_path}")
        return

    remote_filename = remote_filename or os.path.basename(local_file_path)

    try:
        logging.info(f"Connecting to FTP server: {server}")
        with FTP(server) as ftp:
            ftp.login(user=username, passwd=password)
            logging.info(f"Logged in as {username}")

            ftp.cwd(remote_dir)
            logging.info(f"Changed to remote directory: {remote_dir}")

            with open(local_file_path, "rb") as file:
                ftp.storbinary(f"STOR {remote_filename}", file)
            logging.info(f"File uploaded successfully as {remote_filename}")
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller continue.
        logging.error(f"FTP upload failed: {e}")
        return
104+
105+
106+
def main():
    """Generate the EMDB link file and upload it over FTP.

    Steps:
      1. Parse the mandatory ``-w/--workDir`` argument.
      2. Read ``tab-files/emdb_pubmed.tab`` under the working directory.
      3. Build the link XML and write it to
         ``EMDB_linkFile_providerID_2057.xml`` in the working directory.
      4. Read the ``[epmc_ftp]`` section (``server``, ``username``,
         ``password``, ``directory``) from ``config.ini`` located next to this
         script, and upload the XML file.

    Raises:
        SystemExit: If ``--workDir`` is not supplied (raised by argparse), or
            with status 1 if ``config.ini`` cannot be read or lacks any of
            the required ``[epmc_ftp]`` options.
    """
    setup_logging()

    parser = argparse.ArgumentParser()
    # required=True: without a working directory the path joins below would
    # fail with an opaque TypeError on None.
    parser.add_argument('-w', '--workDir', type=str, required=True, help="Main working directory path .")
    args = parser.parse_args()
    base_dir = args.workDir

    input_tsv = os.path.join(base_dir, "tab-files/emdb_pubmed.tab")
    output_xml = os.path.join(base_dir, "EMDB_linkFile_providerID_2057.xml")

    data = read_tsv(input_tsv)
    xml_root = build_xml(data)
    write_xml(xml_root, output_xml)

    logging.info(f"XML file generated: {output_xml}")

    # Upload the XML file to the FTP server
    logging.info("Uploading XML file to FTP server")
    config = configparser.ConfigParser()
    env_file = os.path.join(Path(__file__).parent.absolute(), "config.ini")
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did read, so an empty result means no configuration.
    if not config.read(env_file):
        logging.error(f"Configuration file not found or unreadable: {env_file}")
        raise SystemExit(1)
    try:
        ftp_server = config.get("epmc_ftp", "server")
        ftp_user = config.get("epmc_ftp", "username")
        ftp_pass = config.get("epmc_ftp", "password")
        ftp_dir = config.get("epmc_ftp", "directory")
    except configparser.Error as e:
        logging.error(f"Invalid FTP configuration in {env_file}: {e}")
        raise SystemExit(1) from e

    upload_file_via_ftp(
        server=ftp_server,
        username=ftp_user,
        password=ftp_pass,
        local_file_path=output_xml,
        remote_dir=ftp_dir
    )

    logging.info("Processing complete.")
142+
143+
144+
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)