@@ -1,6 +1,6 @@
 import logging
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict, Final, Iterable, List, Optional, Tuple
 
 from sqlalchemy.engine.url import make_url
@@ -1512,7 +1512,8 @@ def get_platform(self) -> str:
         try:
             parser = self.get_parser(self.connector_manifest)
             return parser.source_platform
-        except Exception:
+        except Exception as e:
+            logger.debug(f"Failed to get platform from parser: {e}")
             # If parser fails, try to infer from JDBC URL directly
             jdbc_url = self.connector_manifest.config.get("connection.url", "")
             if jdbc_url:
@@ -1531,6 +1532,8 @@ class SnowflakeSourceConnector(BaseConnector):
     Topic naming: <topic.prefix><database.schema.tableName>
     """
 
+    _cached_expanded_tables: Optional[List[str]] = field(default=None, init=False)
+
     @dataclass
     class SnowflakeSourceParser:
         source_platform: str
@@ -1770,6 +1773,10 @@ def _query_tables_from_datahub(
             )
             regex = re.compile(regex_pattern)
 
+            # TODO: Performance optimization - This loops through ALL datasets in DataHub
+            # for the platform without filtering. For large DataHub instances with thousands
+            # of tables, this could be very slow. Consider using graph.get_urns_by_filter()
+            # with more specific filters or implementing pagination.
             for urn in all_urns:
                 # URN format: urn:li:dataset:(urn:li:dataPlatform:snowflake,database.schema.table,PROD)
                 table_name = self._extract_table_name_from_urn(urn)
@@ -1789,31 +1796,20 @@ def _query_tables_from_datahub(
             )
             return matched_tables
 
+        except (ConnectionError, TimeoutError) as e:
+            logger.error(f"Failed to connect to DataHub for pattern '{pattern}': {e}")
+            if self.report:
+                self.report.report_failure(
+                    f"datahub_connection_{self.connector_manifest.name}", str(e)
+                )
+            return []
         except Exception as e:
             logger.warning(
-                f"Failed to query tables from DataHub for pattern '{pattern}': {e}"
+                f"Failed to query tables from DataHub for pattern '{pattern}': {e}",
+                exc_info=True,
             )
             return []
 
-    def _extract_table_name_from_urn(self, urn: str) -> Optional[str]:
-        """
-        Extract table name from DataHub URN.
-
-        URN format: urn:li:dataset:(urn:li:dataPlatform:snowflake,database.schema.table,PROD)
-        Returns: database.schema.table
-        """
-        try:
-            # Simple parsing - extract between second comma and third comma
-            parts = urn.split(",")
-            if len(parts) >= 2:
-                # Second part contains the table name
-                table_name = parts[1]
-                return table_name
-        except Exception as e:
-            logger.debug(f"Failed to extract table name from URN {urn}: {e}")
-
-        return None
-
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         """
         Extract lineage mappings from Snowflake tables to Kafka topics.
@@ -1836,7 +1832,7 @@ def extract_lineages(self) -> List[KafkaConnectLineage]:
             return []
 
         # Check if we have cached expanded tables from get_topics_from_config()
-        if hasattr(self, "_cached_expanded_tables"):
+        if self._cached_expanded_tables is not None:
             table_names = self._cached_expanded_tables
             if not table_names:
                 logger.debug(
@@ -2386,8 +2382,9 @@ def _expand_table_patterns(
             List of fully expanded table names
         """
         # Check if feature is enabled
-        if not getattr(self.config, "use_schema_resolver", False) or not getattr(
-            self.config, "schema_resolver_expand_patterns", False
+        if (
+            not self.config.use_schema_resolver
+            or not self.config.schema_resolver_expand_patterns
         ):
             # Fall back to original behavior - parse as-is
             return parse_comma_separated_list(table_config)
@@ -2537,31 +2534,20 @@ def _query_tables_from_datahub(
             )
             return matched_tables
 
+        except (ConnectionError, TimeoutError) as e:
+            logger.error(f"Failed to connect to DataHub for pattern '{pattern}': {e}")
+            if self.report:
+                self.report.report_failure(
+                    f"datahub_connection_{self.connector_manifest.name}", str(e)
+                )
+            return []
         except Exception as e:
             logger.warning(
-                f"Failed to query tables from DataHub for pattern '{pattern}': {e}"
+                f"Failed to query tables from DataHub for pattern '{pattern}': {e}",
+                exc_info=True,
             )
             return []
 
-    def _extract_table_name_from_urn(self, urn: str) -> Optional[str]:
-        """
-        Extract table name from DataHub URN.
-
-        URN format: urn:li:dataset:(urn:li:dataPlatform:postgres,database.schema.table,PROD)
-        Returns: database.schema.table
-        """
-        try:
-            # Simple parsing - extract between second comma and third comma
-            parts = urn.split(",")
-            if len(parts) >= 2:
-                # Second part contains the table name
-                table_name = parts[1]
-                return table_name
-        except Exception as e:
-            logger.debug(f"Failed to extract table name from URN {urn}: {e}")
-
-        return None
-
 
 @dataclass
 class ConfigDrivenSourceConnector(BaseConnector):