From 10fbd20b9b05be2727e40745364dac12d77f8613 Mon Sep 17 00:00:00 2001 From: amudha Date: Tue, 13 Aug 2024 09:57:03 +0100 Subject: [PATCH 1/3] pubmed hitCount removed --- fetch_pubmed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fetch_pubmed.py b/fetch_pubmed.py index 15b535f..3141b48 100644 --- a/fetch_pubmed.py +++ b/fetch_pubmed.py @@ -41,7 +41,6 @@ def call_ePubmedCentral(pubmed_list, uri): if response.status_code == 200: try: pmcjdata = json.loads(response.text) - #hitCount = pmcjdata['hitCount'] if 'result' in pmcjdata['resultList']: result = pmcjdata['resultList']['result'] for pub_data in result: From 49417495a233c7bac213a38fb2d82d60012bc1e4 Mon Sep 17 00:00:00 2001 From: amudha Date: Wed, 15 Jan 2025 09:55:12 +0000 Subject: [PATCH 2/3] README --- README.md | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 82b63f9..522da0b 100644 --- a/README.md +++ b/README.md @@ -1 +1,88 @@ -# added_annotations \ No newline at end of file +# Added annotations: EMICSS (**EM**DB **I**ntegration with **C**omplexes, **S**tructures and **S**equences) + +This repository provides tools and scripts for extracting and adding annotations to EMDB entries, which are used to enhance the metadata associated with EM datasets. + +### Table of Contents + +* Installation +* Configuration +* Usage +* Contributing +* License + +### Installation + +To install the necessary dependencies, run: +pip install -r requirements.txt + +### Configuration + +The repository uses a config.ini file for configuration, which is not included in the repository.
This file should be created in the root directory of the project with the following structure: + +[file_paths] +uniprot_tab: /uniprot.tsv +CP_ftp: /complextab +components_cif: /components.cif +chem_comp_list: /chem_comp_list.xml +pmc_ftp_gz: /PMID_PMCID_DOI.csv.gz +pmc_ftp: /PMID_PMCID_DOI.csv +emdb_pubmed: /emdb_pubmed.log +emdb_orcid: /emdb_orcid.log +assembly_ftp: /assembly/ +BLAST_DB: /ncbi-blast-2.13.0+/database/uniprot_sprot +BLASTP_BIN: blastp +sifts_GO: /pdb_chain_go.csv +GO_obo: /go.obo +GO_interpro: /nfs/ftp/pub/databases/GO/goa/external2go/interpro2go +sifts: /split_xml/ +alphafold_ftp: /accession_ids.txt +rfam_ftp: /rfam_files_combined.txt + +[api] +pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST + +#### File Sources and Download Links +| File | Description | Download Link | +|-------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| +| uniprot.tsv | UniProt annotations | https://rest.uniprot.org/uniprotkb/stream?fields=accession,xref_pdb,protein_name&query=((database:pdb))&format=tsv&compressed=false | +| complextab | Complex Portal data | https://ftp.ebi.ac.uk/pub/databases/complexportal/complexes.tab.gz | +| components.cif | Chemical components data | https://ftp.ebi.ac.uk/pub/databases/msd/pdbechem_v2/ccd/components.cif | +| chem_comp_list.xml | Chemical component list | https://ftp.ebi.ac.uk/pub/databases/msd/pdbechem_v2/ccd/chem_comp_list.xml | +| PMID_PMCID_DOI.csv.gz | Europe PMC dataset (compressed) | https://europepmc.org/pub/databases/pmc/DOI/PMID_PMCID_DOI.csv.gz | +| PMID_PMCID_DOI.csv | Unzipped version of the Europe PMC dataset | https://ftp.ebi.ac.uk/pub/databases/pmc/DOI/PMID_PMCID_DOI.csv | +| emdb_pubmed | Mapping file created after running PublicationMapping.py | emdb_pubmed.log | +| emdb_orcid | Mapping file created after running PublicationMapping.py | emdb_orcid.log | +| assembly_ftp
| PDB assemblies | https://ftp.ebi.ac.uk/pub/databases/msd/assemblies/split/ | +| BLAST_DB | UniProt BLAST database | https://ftp.uniprot.org/pub/databases/uniprot/uniprot_sprot/uniprot_sprot.fasta.gz | +| sifts_GO | PDB chain Gene Ontology mapping | https://ftp.ebi.ac.uk/pub/databases/msd/sifts/pdb_chain_go.csv | +| GO_obo | Gene Ontology definitions | https://current.geneontology.org/ontology/go.obo | +| GO_interpro | InterPro to GO mapping | https://ftp.ebi.ac.uk/pub/databases/GO/goa/external2go/interpro2go | +| sifts | SIFTS data | https://ftp.ebi.ac.uk/pub/databases/msd/sifts/split_xml/ | +| alphafold_ftp | AlphaFold DB accession IDs | https://ftp.ebi.ac.uk/pub/databases/alphafold/accession_ids.csv | +| rfam_ftp | RFAM files | https://www.ebi.ac.uk/pdbe/search/pdb/select?q=emdb_id:*%20AND%20rfam:%5B*%20TO%20*%5D&wt=csv&fl=emdb_id,pdb_id,rfam,rfam_id,entity_id&rows=9999999 | + +Download EMDB metadata files from https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-xxxx/header/emd-xxxx-v30.xml (replace "xxxx" with correct EMDB accession number) +Download EMPIAR metadata files from https://ftp.ebi.ac.uk/pub/databases/emtest/empiar/headers/xxxxx.xml (replace "xxxxx" with correct EMPIAR accession number) +Replace with the base directory where you store the files locally. Ensure all required files are downloaded and referenced correctly in the config.ini. Make sure your internet connection is active to query api endpoint during execution. + +### Usage + +To use the tools and scripts in this repository, follow these steps: +Clone the repository: +git clone https://github.com/emdb-empiar/added_annotations.git +cd added_annotations + +Ensure the config.ini file is properly configured as described above.
+ +#### Executing the scripts: + +Execute the scripts independently in the following recommended order: +* fetch_empiar.py: python fetch_empiar.py -w -f +* fetch_pubmed.py: python fetch_pubmed.py -w -f +* added_annotations.py: python added_annotations.py -w -f --all -t +* fetch_afdb.py: python fetch_afdb.py -w +* write_xml.py: python write_xml.py + +### **Landing Page** + +For more information about EMICSS, visit the official EMICSS landing page (https://www.ebi.ac.uk/emdb/emicss). This page provides detailed information about the EMDB/EMICSS project. \ No newline at end of file From fb7a6bafcac38d7f470670ff600aebaf4845985b Mon Sep 17 00:00:00 2001 From: Neli Fonseca Date: Wed, 15 Jan 2025 10:27:59 +0000 Subject: [PATCH 3/3] Update README.md Updates in README --- README.md | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 522da0b..97d46ac 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ pip install -r requirements.txt The repository uses a config.ini file for configuration, which is not included in the repository.
This file should be created in the root directory of the project with the following structure: +``` [file_paths] uniprot_tab: /uniprot.tsv CP_ftp: /complextab @@ -40,6 +41,7 @@ rfam_ftp: /rfam_files_combined.txt [api] pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST +``` #### File Sources and Download Links | File | Description | Download Link | @@ -60,29 +62,37 @@ pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST | sifts | SIFTS data | https://ftp.ebi.ac.uk/pub/databases/msd/sifts/split_xml/ | | alphafold_ftp | AlphaFold DB accession IDs | https://ftp.ebi.ac.uk/pub/databases/alphafold/accession_ids.csv | | rfam_ftp | RFAM files | https://www.ebi.ac.uk/pdbe/search/pdb/select?q=emdb_id:*%20AND%20rfam:%5B*%20TO%20*%5D&wt=csv&fl=emdb_id,pdb_id,rfam,rfam_id,entity_id&rows=9999999 | - -Download EMDB metadata files from https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-xxxx/header/emd-xxxx-v30.xml (replace "xxxx" with correct EMDB accession number) -Download EMPIAR metadata files from https://ftp.ebi.ac.uk/pub/databases/emtest/empiar/headers/xxxxx.xml (replace "xxxxx" with correct EMPIAR accession number) -Replace with the base directory where you store the files locally. Ensure all required files are downloaded and referenced correctly in the config.ini. Make sure your internet connection is active to query api endpoint during execution. +| emd-xxxx-v30.xml | EMDB metadata | https://ftp.ebi.ac.uk/pub/databases/emdb/ | +| xxxxx.xml | EMPIAR metadata | https://ftp.ebi.ac.uk/pub/databases/emtest/empiar | ### Usage -To use the tools and scripts in this repository, follow these steps: -Clone the repository: -git clone https://github.com/emdb-empiar/added_annotations.git -cd added_annotations - -Ensure the config.ini file is properly configured as described above. +To use the tools and scripts in this repository, you just need to clone it and ensure the config.ini file is properly configured as described above.
#### Executing the scripts: Execute the scripts independently in the following recommended order: -* fetch_empiar.py: python fetch_empiar.py -w -f -* fetch_pubmed.py: python fetch_pubmed.py -w -f -* added_annotations.py: python added_annotations.py -w -f --all -t -* fetch_afdb.py: python fetch_afdb.py -w -* write_xml.py: python write_xml.py - -### **Landing Page** - -For more information about EMICSS, visit the official EMICSS landing page (https://www.ebi.ac.uk/emdb/emicss). This page provides detailed information about the EMDB/EMICSS project. \ No newline at end of file +##### EMPIAR mapping +``` +fetch_empiar.py: python fetch_empiar.py -w -f +``` +##### Publication mapping +``` +fetch_pubmed.py: python fetch_pubmed.py -w -f +``` +##### Protein, complexes and ligands mapping +``` +added_annotations.py: python added_annotations.py -w -f --all -t +``` +##### AlphaFold DB mapping +``` +fetch_afdb.py: python fetch_afdb.py -w +``` +##### Write files +``` +write_xml.py: python write_xml.py +``` + +### Further information + +For more information about EMICSS, visit the official EMICSS website (https://www.ebi.ac.uk/emdb/emicss). This page provides detailed information about the EMDB/EMICSS project.