Skip to content

Commit 2ff81f6

Browse files
committed
Support string and integer ID property types
1 parent d1db769 commit 2ff81f6

File tree

9 files changed

+81
-20
lines changed

9 files changed

+81
-20
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ python3 redisgraph_bulk_loader/bulk_insert.py GRAPHNAME [OPTIONS]
4949
| -R | --relations-with-type TEXT | Relationship Type followed by path to relationship CSV file |
5050
| -o | --separator CHAR | Field token separator in CSV files (default: comma) |
5151
| -d | --enforce-schema | Requires each cell to adhere to the schema defined in the CSV header |
52+
| -i | --id-type TEXT | The data type of unique node ID properties (either STRING or INTEGER) |
5253
| -s | --skip-invalid-nodes | Skip nodes that reuse previously defined IDs instead of exiting with an error |
5354
| -e | --skip-invalid-edges | Skip edges that use invalid IDs for endpoints instead of exiting with an error |
5455
| -q | --quote INT | The quoting format used in the CSV file. QUOTE_MINIMAL=0,QUOTE_ALL=1,QUOTE_NONNUMERIC=2,QUOTE_NONE=3 |
@@ -146,7 +147,7 @@ The accepted data types are:
146147
| STRING | A string value | Yes |
147148
| ARRAY | An array value | Yes |
148149

149-
If an `ID` column has a name string, the value will be added to each node as a property. Otherwise, it is internal to the bulk loader operation and will not appear in the graph. `START_ID` and `END_ID` columns will never be added as properties.
150+
If an `ID` column has a name string, the value will be added to each node as a property. This property will be a string by default, though it may be switched to integer using the `--id-type` argument. If the name string is not provided, the ID is internal to the bulk loader operation and will not appear in the graph. `START_ID` and `END_ID` columns will never be added as properties.
150151

151152
### ID Namespaces
152153
Typically, node identifiers need to be unique across all input CSVs. When using an input schema, it is (optionally) possible to create ID namespaces, and the identifier only needs to be unique across its namespace. This is particularly useful when each input CSV has primary keys which overlap with others.

redisgraph_bulk_loader/bulk_insert.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def process_entities(entities):
6060
@click.option('--separator', '-o', default=',', help='Field token separator in csv file')
6161
# Schema options
6262
@click.option('--enforce-schema', '-d', default=False, is_flag=True, help='Enforce the schema described in CSV header rows')
63+
@click.option('--id-type', '-i', default='STRING', help='The data type of unique node ID properties (either STRING or INTEGER)')
6364
@click.option('--skip-invalid-nodes', '-s', default=False, is_flag=True, help='ignore nodes that use previously defined IDs')
6465
@click.option('--skip-invalid-edges', '-e', default=False, is_flag=True, help='ignore invalid edges, print an error message and continue loading (True), or stop loading after an edge loading failure (False)')
6566
@click.option('--quote', '-q', default=0, help='the quoting format used in the CSV file. QUOTE_MINIMAL=0,QUOTE_ALL=1,QUOTE_NONNUMERIC=2,QUOTE_NONE=3')
@@ -70,7 +71,7 @@ def process_entities(entities):
7071
@click.option('--max-token-size', '-t', default=64, help='max size of each token in megabytes (default 64, max 512)')
7172
@click.option('--index', '-i', multiple=True, help='Label:Property on which to create an index')
7273
@click.option('--full-text-index', '-f', multiple=True, help='Label:Property on which to create a full text search index')
73-
def bulk_insert(graph, host, port, password, user, unix_socket_path, nodes, nodes_with_label, relations, relations_with_type, separator, enforce_schema, skip_invalid_nodes, skip_invalid_edges, escapechar, quote, max_token_count, max_buffer_size, max_token_size, index, full_text_index):
74+
def bulk_insert(graph, host, port, password, user, unix_socket_path, nodes, nodes_with_label, relations, relations_with_type, separator, enforce_schema, id_type, skip_invalid_nodes, skip_invalid_edges, escapechar, quote, max_token_count, max_buffer_size, max_token_size, index, full_text_index):
7475
if sys.version_info.major < 3 or sys.version_info.minor < 6:
7576
raise Exception("Python >= 3.6 is required for the RedisGraph bulk loader.")
7677

@@ -83,7 +84,7 @@ def bulk_insert(graph, host, port, password, user, unix_socket_path, nodes, node
8384
store_node_identifiers = any(relations) or any(relations_with_type)
8485

8586
# Initialize configurations with command-line arguments
86-
config = Config(max_token_count, max_buffer_size, max_token_size, enforce_schema, skip_invalid_nodes, skip_invalid_edges, separator, int(quote), store_node_identifiers, escapechar)
87+
config = Config(max_token_count, max_buffer_size, max_token_size, enforce_schema, id_type, skip_invalid_nodes, skip_invalid_edges, separator, int(quote), store_node_identifiers, escapechar)
8788

8889
# Attempt to connect to Redis server
8990
try:

redisgraph_bulk_loader/config.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
from exceptions import SchemaError
2+
3+
14
class Config:
2-
def __init__(self, max_token_count=1024 * 1023, max_buffer_size=64, max_token_size=64, enforce_schema=False, skip_invalid_nodes=False, skip_invalid_edges=False, separator=',', quoting=3, store_node_identifiers=False, escapechar='\\'):
5+
def __init__(self, max_token_count=1024 * 1023, max_buffer_size=64, max_token_size=64, enforce_schema=False, id_type='STRING', skip_invalid_nodes=False, skip_invalid_edges=False, separator=',', quoting=3, store_node_identifiers=False, escapechar='\\'):
36
"""Settings for this run of the bulk loader"""
47
# Maximum number of tokens per query
58
# 1024 * 1024 is the hard-coded Redis maximum. We'll set a slightly lower limit so
@@ -13,6 +16,10 @@ def __init__(self, max_token_count=1024 * 1023, max_buffer_size=64, max_token_si
1316
self.max_token_size = min(max_token_size * 1_000_000, 512 * 1_000_000, self.max_buffer_size)
1417

1518
self.enforce_schema = enforce_schema
19+
id_type = str.upper(id_type)
20+
if id_type != 'STRING' and id_type != 'INTEGER':
21+
raise SchemaError("Specified invalid argument for --id-type, expected STRING or INTEGER")
22+
self.id_type = id_type
1623
self.skip_invalid_nodes = skip_invalid_nodes
1724
self.skip_invalid_edges = skip_invalid_edges
1825
self.separator = separator

redisgraph_bulk_loader/entity_file.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@ class Type(Enum):
2121
INT = 4 # alias to LONG
2222
INTEGER = 4 # alias to LONG
2323
ARRAY = 5
24-
ID = 6
25-
START_ID = 7
26-
END_ID = 8
27-
IGNORE = 9
24+
ID_STRING = 6
25+
ID_INTEGER = 7
26+
START_ID = 8
27+
END_ID = 9
28+
IGNORE = 10
2829

2930

3031
def convert_schema_type(in_type):
@@ -33,8 +34,8 @@ def convert_schema_type(in_type):
3334
except KeyError:
3435
# Handling for ID namespaces
3536
# TODO think of better alternatives
36-
if in_type.startswith('ID('):
37-
return Type.ID
37+
if in_type.startswith('ID'):
38+
return Type.ID_STRING
3839
elif in_type.startswith('START_ID('):
3940
return Type.START_ID
4041
elif in_type.startswith('END_ID('):
@@ -70,8 +71,7 @@ def typed_prop_to_binary(prop_val, prop_type):
7071
# TODO This is not allowed in Cypher, consider how to handle it here rather than in-module.
7172
return struct.pack(format_str, 0)
7273

73-
# TODO allow ID type specification
74-
if prop_type == Type.LONG:
74+
if prop_type == Type.ID_INTEGER or prop_type == Type.LONG:
7575
try:
7676
numeric_prop = int(prop_val)
7777
return struct.pack(format_str + "q", Type.LONG.value, numeric_prop)
@@ -99,7 +99,7 @@ def typed_prop_to_binary(prop_val, prop_type):
9999
else:
100100
raise SchemaError("Could not parse '%s' as a boolean" % prop_val)
101101

102-
elif prop_type == Type.ID or prop_type == Type.STRING:
102+
elif prop_type == Type.ID_STRING or prop_type == Type.STRING:
103103
# If we've reached this point, the property is a string
104104
encoded_str = str.encode(prop_val) # struct.pack requires bytes objects as arguments
105105
# Encoding len+1 adds a null terminator to the string
@@ -112,7 +112,7 @@ def typed_prop_to_binary(prop_val, prop_type):
112112
return array_prop_to_binary(format_str, prop_val)
113113

114114
# If it hasn't returned by this point, it is trying to set it to a type that it can't adopt
115-
raise SchemaError("unable to parse [" + prop_val + "] with type ["+repr(prop_type)+"]")
115+
raise SchemaError("unable to parse [" + prop_val + "] with type [" + repr(prop_type) + "]")
116116

117117

118118
# Convert a single CSV property field with an inferred type into a binary stream.
@@ -238,7 +238,7 @@ def convert_header_with_schema(self, header):
238238
col_type = convert_schema_type(pair[1].upper().strip())
239239

240240
# If the column did not have a name but the type requires one, emit an error.
241-
if len(pair[0]) == 0 and col_type not in (Type.ID, Type.START_ID, Type.END_ID, Type.IGNORE):
241+
if len(pair[0]) == 0 and col_type not in (Type.ID_STRING, Type.ID_INTEGER, Type.START_ID, Type.END_ID, Type.IGNORE):
242242
raise SchemaError("%s: Each property in the header should be a colon-separated pair" % (self.infile.name))
243243
else:
244244
# We have a column name and a type.
@@ -247,6 +247,10 @@ def convert_header_with_schema(self, header):
247247
column_name = pair[0].strip()
248248
self.column_names[idx] = column_name
249249

250+
# ID types may be parsed as strings or integers depending on user specification.
251+
if col_type == Type.ID_STRING and self.config.id_type == 'INTEGER':
252+
col_type = Type.ID_INTEGER
253+
250254
# Store the column type.
251255
self.types[idx] = col_type
252256

redisgraph_bulk_loader/label.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,14 @@ def post_process_header_with_schema(self, header):
3030
return
3131

3232
# Verify that exactly one field is labeled ID.
33-
if self.types.count(Type.ID) != 1:
33+
if (self.types.count(Type.ID_STRING) + self.types.count(Type.ID_INTEGER)) != 1:
3434
raise SchemaError("Node file '%s' should have exactly one ID column."
3535
% (self.infile.name))
36-
self.id = self.types.index(Type.ID) # Track the offset containing the node ID.
36+
# Track the offset containing the node ID.
37+
try:
38+
self.id = self.types.index(Type.ID_STRING)
39+
except ValueError:
40+
self.id = self.types.index(Type.ID_INTEGER)
3741
id_field = header[self.id]
3842
# If the ID field specifies an ID namespace in parentheses like "val:ID(NAMESPACE)", capture the namespace.
3943
match = re.search(r"\((\w+)\)", id_field)

redisgraph_bulk_loader/relation_type.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def process_schemaless_header(self, header):
2222
self.end_namespace = None
2323

2424
for idx, field in enumerate(header[2:]):
25-
self.column_names[idx+2] = field.strip()
25+
self.column_names[idx + 2] = field.strip()
2626

2727
def post_process_header_with_schema(self, header):
2828
# Can interleave these tasks if preferred.

test/test_bulk_loader.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,48 @@ def test18_ensure_full_text_index_is_created(self):
714714
# We should find only the tamarins
715715
self.assertEqual(query_result.result_set, expected_result)
716716

717+
def test19_integer_ids(self):
718+
"""Validate that IDs can be persisted as integers."""
719+
720+
graphname = "id_integer_graph"
721+
with open('/tmp/nodes.tmp', mode='w') as csv_file:
722+
out = csv.writer(csv_file)
723+
out.writerow(['id:ID(User)', 'name:STRING'])
724+
out.writerow([0, 'Jeffrey'])
725+
out.writerow([1, 'Filipe'])
726+
727+
with open('/tmp/nodes2.tmp', mode='w') as csv_file:
728+
out = csv.writer(csv_file)
729+
out.writerow(['id:ID(Post)', 'views:INT'])
730+
out.writerow([0, 20])
731+
out.writerow([1, 40])
732+
733+
with open('/tmp/relations.tmp', mode='w') as csv_file:
734+
out = csv.writer(csv_file)
735+
out.writerow([':START_ID(User)', ':END_ID(Post)'])
736+
out.writerow([0, 0])
737+
out.writerow([1, 1])
738+
739+
runner = CliRunner()
740+
res = runner.invoke(bulk_insert, ['--nodes-with-label', 'User', '/tmp/nodes.tmp',
741+
'--nodes-with-label', 'Post', '/tmp/nodes2.tmp',
742+
'--relations-with-type', 'AUTHOR', '/tmp/relations.tmp',
743+
'--enforce-schema',
744+
'--id-type', 'integer',
745+
graphname], catch_exceptions=False)
746+
747+
self.assertEqual(res.exit_code, 0)
748+
self.assertIn('4 nodes created', res.output)
749+
self.assertIn("2 relations created", res.output)
750+
751+
graph = Graph(graphname, self.redis_con)
752+
query_result = graph.query('MATCH (src)-[]->(dest) RETURN src.id, src.name, LABELS(src), dest.id, dest.views, LABELS(dest) ORDER BY src.id')
753+
754+
# The IDs of the results should be parsed as integers
755+
expected_result = [[0, 'Jeffrey', 'User', 0, 20, 'Post'],
756+
[1, 'Filipe', 'User', 1, 40, 'Post']]
757+
self.assertEqual(query_result.result_set, expected_result)
758+
717759

718760
if __name__ == '__main__':
719761
unittest.main()

test/test_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def test01_default_values(self):
1010
self.assertEqual(config.max_buffer_size, 64_000_000)
1111
self.assertEqual(config.max_token_size, 64_000_000)
1212
self.assertEqual(config.enforce_schema, False)
13+
self.assertEqual(config.id_type, 'STRING')
1314
self.assertEqual(config.skip_invalid_nodes, False)
1415
self.assertEqual(config.skip_invalid_edges, False)
1516
self.assertEqual(config.store_node_identifiers, False)
@@ -18,11 +19,12 @@ def test01_default_values(self):
1819

1920
def test02_modified_values(self):
2021
"""Verify that Config_set updates Config class values accordingly."""
21-
config = Config(max_token_count=10, max_buffer_size=500, max_token_size=200, enforce_schema=True, skip_invalid_nodes=True, skip_invalid_edges=True, separator='|', quoting=0)
22+
config = Config(max_token_count=10, max_buffer_size=500, max_token_size=200, enforce_schema=True, id_type='INTEGER', skip_invalid_nodes=True, skip_invalid_edges=True, separator='|', quoting=0)
2223
self.assertEqual(config.max_token_count, 10)
2324
self.assertEqual(config.max_token_size, 200_000_000) # Max token size argument is converted to megabytes
2425
self.assertEqual(config.max_buffer_size, 500_000_000) # Buffer size argument is converted to megabytes
2526
self.assertEqual(config.enforce_schema, True)
27+
self.assertEqual(config.id_type, 'INTEGER')
2628
self.assertEqual(config.skip_invalid_nodes, True)
2729
self.assertEqual(config.skip_invalid_edges, True)
2830
self.assertEqual(config.store_node_identifiers, False)

test/test_label.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,5 @@ def test02_process_header_with_schema(self):
4646
self.assertEqual(label.entity_str, 'LabelTest')
4747
self.assertEqual(label.prop_count, 2)
4848
self.assertEqual(label.entities_count, 2)
49-
self.assertEqual(label.types[0].name, 'ID')
49+
self.assertEqual(label.types[0].name, 'ID_STRING')
5050
self.assertEqual(label.types[1].name, 'STRING')

0 commit comments

Comments
 (0)