Commit dcef746

Couple of PR review fixes
1 parent 3710245 commit dcef746

File tree

2 files changed: +66 additions, -52 deletions

metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py

Lines changed: 35 additions & 7 deletions
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Dict, Final, List, Optional
+from typing import TYPE_CHECKING, Dict, Final, List, Optional, TypedDict

 from pydantic import model_validator
 from pydantic.fields import Field
@@ -18,6 +18,7 @@
     StatefulIngestionConfigBase,
 )
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.urns.dataset_urn import DatasetUrn

 if TYPE_CHECKING:
     from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -33,6 +34,15 @@
 DEFAULT_CONNECT_URI: Final[str] = "http://localhost:8083/"


+class FineGrainedLineageDict(TypedDict):
+    """Structure for fine-grained (column-level) lineage mappings."""
+
+    upstreamType: str
+    downstreamType: str
+    upstreams: List[str]
+    downstreams: List[str]
+
+
 class ConnectorConfigKeys:
     """Centralized configuration keys to avoid magic strings throughout the codebase."""

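For context, a minimal sketch of what the new TypedDict buys over the previous plain Dict[str, Any]: a type checker can now validate the keys and value types of each lineage mapping. Field names come from the class above; the URN strings are made-up examples.

from typing import List, TypedDict


class FineGrainedLineageDict(TypedDict):
    upstreamType: str
    downstreamType: str
    upstreams: List[str]
    downstreams: List[str]


# mypy flags a missing key or wrongly typed value here,
# which a plain Dict[str, Any] would silently accept.
lineage: FineGrainedLineageDict = {
    "upstreamType": "FIELD_SET",
    "downstreamType": "FIELD",
    "upstreams": ["urn:li:schemaField:(urn:li:dataset:(...),id)"],    # example URN
    "downstreams": ["urn:li:schemaField:(urn:li:dataset:(...),id)"],  # example URN
}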
@@ -361,7 +371,7 @@ class KafkaConnectLineage:
     target_platform: str
     job_property_bag: Optional[Dict[str, str]] = None
     source_dataset: Optional[str] = None
-    fine_grained_lineages: Optional[List[Dict[str, Any]]] = None
+    fine_grained_lineages: Optional[List[FineGrainedLineageDict]] = None


 @dataclass
@@ -664,7 +674,7 @@ def _extract_fine_grained_lineage(
         source_platform: str,
         target_dataset: str,
         target_platform: str = "kafka",
-    ) -> Optional[List[Dict[str, Any]]]:
+    ) -> Optional[List[FineGrainedLineageDict]]:
         """
         Extract column-level lineage using schema metadata from DataHub.

@@ -681,9 +691,9 @@ def _extract_fine_grained_lineage(
             List of fine-grained lineage dictionaries or None if not available
         """
         # Check if feature is enabled
-        if not getattr(self.config, "use_schema_resolver", False):
+        if not self.config.use_schema_resolver:
             return None
-        if not getattr(self.config, "schema_resolver_finegrained_lineage", False):
+        if not self.config.schema_resolver_finegrained_lineage:
             return None
         if not self.schema_resolver:
             return None
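The switch from getattr(..., False) to direct attribute access works because these options are declared fields on the pydantic config model, so they always exist with a default. A minimal sketch of that pattern; the field names are taken from the diff, while the model name is assumed for illustration.

from pydantic import BaseModel


class KafkaConnectSourceConfig(BaseModel):  # assumed name, for illustration only
    # Declared fields always exist on instances, so direct attribute access is
    # safe and lets mypy catch typos that getattr(..., default) would hide.
    use_schema_resolver: bool = False
    schema_resolver_finegrained_lineage: bool = False
    schema_resolver_expand_patterns: bool = False


config = KafkaConnectSourceConfig()
assert config.use_schema_resolver is False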
@@ -718,10 +728,10 @@

         # Create fine-grained lineage for each source column
         # Assume 1:1 mapping (column names are preserved)
-        fine_grained_lineages = []
+        fine_grained_lineages: List[FineGrainedLineageDict] = []

         for source_col in source_schema:
-            fine_grained_lineage = {
+            fine_grained_lineage: FineGrainedLineageDict = {
                 "upstreamType": "FIELD_SET",
                 "downstreamType": "FIELD",
                 "upstreams": [make_schema_field_urn(source_urn_str, source_col)],
@@ -744,5 +754,23 @@

         return None

+    def _extract_table_name_from_urn(self, urn: str) -> Optional[str]:
+        """
+        Extract table name from DataHub URN using standard DatasetUrn parser.
+
+        Args:
+            urn: DataHub dataset URN
+                Format: urn:li:dataset:(urn:li:dataPlatform:platform,table_name,ENV)
+                Example: urn:li:dataset:(urn:li:dataPlatform:snowflake,database.schema.table,PROD)
+
+        Returns:
+            Extracted table name (e.g., "database.schema.table") or None if parsing fails
+        """
+        try:
+            return DatasetUrn.from_string(urn).name
+        except Exception as e:
+            logger.debug(f"Failed to extract table name from URN {urn}: {e}")
+            return None
+

 # Removed: TopicResolver and ConnectorTopicHandlerRegistry - logic moved directly to BaseConnector subclasses
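As a quick illustration of the new helper's behavior, a minimal sketch using the DatasetUrn parser on an example URN (the URN values are illustrative; .name is the accessor used in the diff):

from datahub.utilities.urns.dataset_urn import DatasetUrn

# .name is the table portion that the helper returns.
urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.orders,PROD)"
print(DatasetUrn.from_string(urn).name)  # -> analytics.public.orders

# Malformed input raises inside from_string, so the helper logs and returns None
# instead of silently returning a garbled fragment as the old str.split(",") did.
try:
    DatasetUrn.from_string("not-a-valid-urn")
except Exception as e:
    print(f"parse failed: {e}")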

metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py

Lines changed: 31 additions & 45 deletions
@@ -1,6 +1,6 @@
 import logging
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict, Final, Iterable, List, Optional, Tuple

 from sqlalchemy.engine.url import make_url
@@ -1512,7 +1512,8 @@ def get_platform(self) -> str:
         try:
             parser = self.get_parser(self.connector_manifest)
             return parser.source_platform
-        except Exception:
+        except Exception as e:
+            logger.debug(f"Failed to get platform from parser: {e}")
             # If parser fails, try to infer from JDBC URL directly
             jdbc_url = self.connector_manifest.config.get("connection.url", "")
             if jdbc_url:
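For reference, a minimal sketch of the kind of scheme-based fallback the comment describes; the helper name and the platform mapping below are assumptions for illustration, not the repository's actual code.

from typing import Optional


def infer_platform_from_jdbc_url(jdbc_url: str) -> Optional[str]:
    # "jdbc:postgresql://host:5432/db" -> scheme "postgresql" -> platform "postgres"
    scheme = jdbc_url.removeprefix("jdbc:").split(":", 1)[0].lower()
    scheme_to_platform = {"postgresql": "postgres", "mysql": "mysql", "oracle": "oracle"}
    return scheme_to_platform.get(scheme)


print(infer_platform_from_jdbc_url("jdbc:postgresql://localhost:5432/shop"))  # postgres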
@@ -1531,6 +1532,8 @@ class SnowflakeSourceConnector(BaseConnector):
     Topic naming: <topic.prefix><database.schema.tableName>
     """

+    _cached_expanded_tables: Optional[List[str]] = field(default=None, init=False)
+
     @dataclass
     class SnowflakeSourceParser:
         source_platform: str
17701773
)
17711774
regex = re.compile(regex_pattern)
17721775

1776+
# TODO: Performance optimization - This loops through ALL datasets in DataHub
1777+
# for the platform without filtering. For large DataHub instances with thousands
1778+
# of tables, this could be very slow. Consider using graph.get_urns_by_filter()
1779+
# with more specific filters or implementing pagination.
17731780
for urn in all_urns:
17741781
# URN format: urn:li:dataset:(urn:li:dataPlatform:snowflake,database.schema.table,PROD)
17751782
table_name = self._extract_table_name_from_urn(urn)
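A rough sketch of the direction that TODO points at, assuming a DataHubGraph client exposing the get_urns_by_filter API it names; the exact filter arguments shown are assumptions, not a verified call from this codebase.

# graph is assumed to be a datahub.ingestion.graph.client.DataHubGraph instance.
urns = graph.get_urns_by_filter(
    entity_types=["dataset"],
    platform="snowflake",  # narrow server-side instead of scanning every dataset
    env="PROD",
)
for urn in urns:
    ...  # match each URN against the compiled regex, as in the loop above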
@@ -1789,31 +1796,20 @@
             )
             return matched_tables

+        except (ConnectionError, TimeoutError) as e:
+            logger.error(f"Failed to connect to DataHub for pattern '{pattern}': {e}")
+            if self.report:
+                self.report.report_failure(
+                    f"datahub_connection_{self.connector_manifest.name}", str(e)
+                )
+            return []
         except Exception as e:
             logger.warning(
-                f"Failed to query tables from DataHub for pattern '{pattern}': {e}"
+                f"Failed to query tables from DataHub for pattern '{pattern}': {e}",
+                exc_info=True,
             )
             return []

-    def _extract_table_name_from_urn(self, urn: str) -> Optional[str]:
-        """
-        Extract table name from DataHub URN.
-
-        URN format: urn:li:dataset:(urn:li:dataPlatform:snowflake,database.schema.table,PROD)
-        Returns: database.schema.table
-        """
-        try:
-            # Simple parsing - extract between second comma and third comma
-            parts = urn.split(",")
-            if len(parts) >= 2:
-                # Second part contains the table name
-                table_name = parts[1]
-                return table_name
-        except Exception as e:
-            logger.debug(f"Failed to extract table name from URN {urn}: {e}")
-
-        return None
-
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         """
         Extract lineage mappings from Snowflake tables to Kafka topics.
@@ -1836,7 +1832,7 @@ def extract_lineages(self) -> List[KafkaConnectLineage]:
             return []

         # Check if we have cached expanded tables from get_topics_from_config()
-        if hasattr(self, "_cached_expanded_tables"):
+        if self._cached_expanded_tables is not None:
             table_names = self._cached_expanded_tables
             if not table_names:
                 logger.debug(
@@ -2386,8 +2382,9 @@ def _expand_table_patterns(
         List of fully expanded table names
         """
         # Check if feature is enabled
-        if not getattr(self.config, "use_schema_resolver", False) or not getattr(
-            self.config, "schema_resolver_expand_patterns", False
+        if (
+            not self.config.use_schema_resolver
+            or not self.config.schema_resolver_expand_patterns
         ):
             # Fall back to original behavior - parse as-is
             return parse_comma_separated_list(table_config)
@@ -2537,31 +2534,20 @@ def _query_tables_from_datahub(
             )
             return matched_tables

+        except (ConnectionError, TimeoutError) as e:
+            logger.error(f"Failed to connect to DataHub for pattern '{pattern}': {e}")
+            if self.report:
+                self.report.report_failure(
+                    f"datahub_connection_{self.connector_manifest.name}", str(e)
+                )
+            return []
         except Exception as e:
             logger.warning(
-                f"Failed to query tables from DataHub for pattern '{pattern}': {e}"
+                f"Failed to query tables from DataHub for pattern '{pattern}': {e}",
+                exc_info=True,
             )
             return []

-    def _extract_table_name_from_urn(self, urn: str) -> Optional[str]:
-        """
-        Extract table name from DataHub URN.
-
-        URN format: urn:li:dataset:(urn:li:dataPlatform:postgres,database.schema.table,PROD)
-        Returns: database.schema.table
-        """
-        try:
-            # Simple parsing - extract between second comma and third comma
-            parts = urn.split(",")
-            if len(parts) >= 2:
-                # Second part contains the table name
-                table_name = parts[1]
-                return table_name
-        except Exception as e:
-            logger.debug(f"Failed to extract table name from URN {urn}: {e}")
-
-        return None
-

 @dataclass
 class ConfigDrivenSourceConnector(BaseConnector):
