From 8bd9498251a5c8cbbbde57f60914fc5e70d8f9b3 Mon Sep 17 00:00:00 2001 From: Renato Marroquin Date: Sun, 24 Sep 2023 20:16:32 +0200 Subject: [PATCH 1/4] ISSUE-289: Replace old metadata with metadata from new file --- .../tagbase_server/utils/processing_utils.py | 44 ++++++++----------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/tagbase_server/tagbase_server/utils/processing_utils.py b/tagbase_server/tagbase_server/utils/processing_utils.py index 57fa4c7..7ec72fa 100644 --- a/tagbase_server/tagbase_server/utils/processing_utils.py +++ b/tagbase_server/tagbase_server/utils/processing_utils.py @@ -312,27 +312,23 @@ def update_submission_metadata( ): # update submission information current_time = dt.now(tz=pytz.utc).astimezone(get_localzone()) - cur.execute( - "UPDATE submission SET md_sha256 = '{}', date_time = '{}'" - " WHERE tag_id = {} AND dataset_id = {} AND submission_id = {}".format( + update_submission_info_query =\ + "UPDATE submission SET md_sha256 = '{}', date_time = '{}' " \ + "WHERE tag_id = {} AND dataset_id = {} AND submission_id = {}".format( metadata_hash, current_time, tag_id, dataset_id, submission_id ) - ) + cur.execute(update_submission_info_query) logger.info( "Submission_id=%s updated with metadata hash=%s", submission_id, metadata_hash ) - # update metadata attributes - for x in metadata: - submission_id = x[0] - attribute_id = x[1] - attribute_value = x[2] - attribute_value = str(attribute_value).strip('"') - cur.execute( - "UPDATE metadata SET attribute_value = '{}' WHERE submission_id = {} AND tag_id = {} AND attribute_id = {}".format( - attribute_value, submission_id, tag_id, attribute_id - ) - ) + # delete previous metadata since we are going to override it + delete_md_query = "DELETE FROM metadata WHERE submission_id = {} AND tag_id = {}".format(submission_id, tag_id) + cur.execute(delete_md_query) + logger.debug("Removed old metadata from submission_id=%s tag_id=%s", submission_id, tag_id) + + # insert new metadata + insert_metadata(cur, metadata, submission_id) logger.info("Updated metadata attributes: %s", metadata) @@ -347,7 +343,6 @@ def process_etuff_file(file, version=None, notes=None): conn = connect() conn.autocommit = True - # TODO we should read the file once and return the hashes we need (metadata/content/entire-file) ( instrument_name, serial_number, @@ -359,11 +354,14 @@ def process_etuff_file(file, version=None, notes=None): number_global_attributes_lines, ) = get_dataset_properties(submission_filename) content_hash = make_hash_sha256(file_content) - logger.debug("Content Hash: %s", content_hash) metadata_hash = make_hash_sha256(metadata_content) - logger.debug("MD Hash: %s", metadata_hash) entire_file_hash = compute_file_sha256(submission_filename) - logger.debug("File Hash: %s", entire_file_hash) + logger.debug( + "Content Hash: %s\tMetadata Hash: %s\tFile Hash: %s", + content_hash, + metadata_hash, + entire_file_hash + ) with conn: with conn.cursor() as cur: @@ -415,17 +413,11 @@ def process_etuff_file(file, version=None, notes=None): ) return 1 + # at this point we have already read form the file all global attribute lines proc_obs = [] variable_lookup = {} - # at this point we have already read form the file all global attribute lines - # line_counter = number_global_attributes_lines - # # TODO we should use the 'content' variable in the following s_time = time.perf_counter() - # with open(file, "rb") as data: - # lines = [line.decode("utf-8", "ignore") for line in data.readlines()] - # lines_length = len(lines) - num_lines_content = len(file_content) logger.debug( "len number_global_atttributes_lines: '%s' len lines_length: '%s'", From 96b44e53b378892af1c97d26c0e50a519f758d69 Mon Sep 17 00:00:00 2001 From: Renato Marroquin Date: Sun, 24 Sep 2023 20:24:37 +0200 Subject: [PATCH 2/4] ISSUE-289: Fix format --- .../tagbase_server/utils/processing_utils.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tagbase_server/tagbase_server/utils/processing_utils.py b/tagbase_server/tagbase_server/utils/processing_utils.py index 7ec72fa..f2d74dc 100644 --- a/tagbase_server/tagbase_server/utils/processing_utils.py +++ b/tagbase_server/tagbase_server/utils/processing_utils.py @@ -312,20 +312,27 @@ def update_submission_metadata( ): # update submission information current_time = dt.now(tz=pytz.utc).astimezone(get_localzone()) - update_submission_info_query =\ - "UPDATE submission SET md_sha256 = '{}', date_time = '{}' " \ + update_submission_info_query = ( + "UPDATE submission SET md_sha256 = '{}', date_time = '{}' " "WHERE tag_id = {} AND dataset_id = {} AND submission_id = {}".format( metadata_hash, current_time, tag_id, dataset_id, submission_id ) + ) cur.execute(update_submission_info_query) logger.info( "Submission_id=%s updated with metadata hash=%s", submission_id, metadata_hash ) # delete previous metadata since we are going to override it - delete_md_query = "DELETE FROM metadata WHERE submission_id = {} AND tag_id = {}".format(submission_id, tag_id) + delete_md_query = ( + "DELETE FROM metadata WHERE submission_id = {} AND tag_id = {}".format( + submission_id, tag_id + ) + ) cur.execute(delete_md_query) - logger.debug("Removed old metadata from submission_id=%s tag_id=%s", submission_id, tag_id) + logger.debug( + "Removed old metadata from submission_id=%s tag_id=%s", submission_id, tag_id + ) # insert new metadata insert_metadata(cur, metadata, submission_id) @@ -360,7 +367,7 @@ def process_etuff_file(file, version=None, notes=None): "Content Hash: %s\tMetadata Hash: %s\tFile Hash: %s", content_hash, metadata_hash, - entire_file_hash + entire_file_hash, ) with conn: From c5a494bc30f2e8f08ca8c1aa36a222a153ed0a3c Mon Sep 17 00:00:00 2001 From: Renato Marroquin Date: Sat, 30 Sep 2023 21:37:49 +0200 Subject: [PATCH 3/4] ISSUE-289: Add unit tests --- .../tagbase_server/test/test_ingest.py | 32 +++++++++++++++++++ .../tagbase_server/utils/processing_utils.py | 4 +-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/tagbase_server/tagbase_server/test/test_ingest.py b/tagbase_server/tagbase_server/test/test_ingest.py index 68e5e04..2118105 100644 --- a/tagbase_server/tagbase_server/test/test_ingest.py +++ b/tagbase_server/tagbase_server/test/test_ingest.py @@ -92,6 +92,38 @@ def test_get_dataset_id(self, mock_connect): tag_id = pu.get_tag_id(cur, 1) assert tag_id, "1" + @mock.patch("psycopg2.connect") + def test_is_only_metadata_change(self, mock_connect): + metadata_hash_stored = ["some_hash"] + file_md_hash = "some_other_hash" + # result of psycopg2.connect(**connection_stuff) + mock_con = mock_connect.return_value + # result of con.cursor(cursor_factory=DictCursor) + mock_cur = mock_con.cursor.return_value + # return this when calling cur.fetchall() + mock_cur.fetchall.return_value = metadata_hash_stored + conn = psycopg2.connect( + dbname="test", + user="test", + host="localhost", + port="32780", + password="test", + ) + cur = conn.cursor() + + # if the method returns anything means that metadata found is different + is_only_metadata_change = pu.is_only_metadata_change( + cur, metadata_hash_stored[0], file_md_hash + ) + assert is_only_metadata_change, True + + # no different metadata found + mock_cur.fetchall.return_value = None + is_only_metadata_change = pu.is_only_metadata_change( + cur, metadata_hash_stored[0], file_md_hash + ) + assert is_only_metadata_change, False + @mock.patch("psycopg2.connect") def test_processing_file_metadata_with_existing_attributes(self, mock_connect): metadata_attribs_in_db = [[1, "instrument_name"], [2, "model"]] diff --git a/tagbase_server/tagbase_server/utils/processing_utils.py b/tagbase_server/tagbase_server/utils/processing_utils.py index f2d74dc..d61a06f 100644 --- a/tagbase_server/tagbase_server/utils/processing_utils.py +++ b/tagbase_server/tagbase_server/utils/processing_utils.py @@ -262,13 +262,13 @@ def get_dataset_properties(submission_filename): ) -def is_only_metadata_change(cursor, metadata_hash, file_content_hash): +def is_only_metadata_change(cursor, metadata_hash, file_data_hash): logger.debug("Detecting metadata submitted...") cursor.execute( "SELECT md_sha256 FROM submission WHERE md_sha256 <> %s AND data_sha256 = %s ", ( metadata_hash, - file_content_hash, + file_data_hash, ), ) db_results = cursor.fetchone() From d20d39b45ecfde79f346febea84f80d398a200f5 Mon Sep 17 00:00:00 2001 From: Renato Marroquin Date: Sat, 30 Sep 2023 21:55:37 +0200 Subject: [PATCH 4/4] ISSUE-289: Add unit tests --- .../tagbase_server/test/test_ingest.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tagbase_server/tagbase_server/test/test_ingest.py b/tagbase_server/tagbase_server/test/test_ingest.py index 2118105..fa1eb1e 100644 --- a/tagbase_server/tagbase_server/test/test_ingest.py +++ b/tagbase_server/tagbase_server/test/test_ingest.py @@ -124,6 +124,36 @@ def test_is_only_metadata_change(self, mock_connect): ) assert is_only_metadata_change, False + @mock.patch("psycopg2.connect") + def test_update_submission_metadata(self, mock_connect): + submission_id = 1 + metadata_attributes = [ + (submission_id, "instrument_name", "some_instrument"), + (submission_id, "model", "some_model"), + ] + # result of psycopg2.connect(**connection_stuff) + mock_con = mock_connect.return_value + # result of con.cursor(cursor_factory=DictCursor) + mock_cur = mock_con.cursor.return_value + # return this when calling cur.fetchall() + mock_cur.fetchall.return_value = metadata_attributes + + conn = psycopg2.connect( + dbname="test", + user="test", + host="localhost", + port="32780", + password="test", + ) + cur = conn.cursor() + tag_id = 1 + dataset_id = 1 + metadata_hash = "some_hash" + + pu.update_submission_metadata( + cur, tag_id, metadata_attributes, submission_id, dataset_id, metadata_hash + ) + @mock.patch("psycopg2.connect") def test_processing_file_metadata_with_existing_attributes(self, mock_connect): metadata_attribs_in_db = [[1, "instrument_name"], [2, "model"]]