@@ -0,0 +1,42 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

require_relative 'query_summary/tokenizer'
require_relative 'query_summary/cache'
require_relative 'query_summary/parser'

module OpenTelemetry
module Helpers
# QuerySummary generates high-level summaries of SQL queries, consisting of
# the key operations and the table names they reference.
#
# @example
#   QuerySummary.generate_summary("SELECT * FROM users WHERE id = 1")
#   # => "SELECT users"
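#
# @example Configuring the cache size (illustrative sketch)
#   QuerySummary.configure_cache(size: 500)
#   QuerySummary.generate_summary("INSERT INTO orders (id) VALUES (1)")
#   # => "INSERT orders"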
module QuerySummary
class << self
def configure_cache(size: Cache::DEFAULT_SIZE)
cache_instance.configure(size: size)
end

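# Returns a short summary such as "SELECT users", caching results for
# identical queries. Falls back to 'UNKNOWN' if tokenizing or parsing raises.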
def generate_summary(query)
cache_instance.fetch(query) do
tokens = Tokenizer.tokenize(query)
Parser.build_summary_from_tokens(tokens)
end
rescue StandardError
'UNKNOWN'
end

private

def cache_instance
@cache_instance ||= Cache.new
end
end
end
end
end
@@ -0,0 +1,55 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

module OpenTelemetry
module Helpers
module QuerySummary
# Cache provides a thread-safe, size-bounded cache for query summaries.
#
# Stores generated query summaries to avoid reprocessing identical queries,
# evicting the oldest entry once the configured size is reached.
# Uses mutex synchronization for thread safety.
#
# @example
# cache = Cache.new
# cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users"
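#
# @example Eviction once the configured size is exceeded (illustrative sketch)
#   cache = Cache.new(size: 2)
#   cache.fetch('a') { 1 } # => 1 (computed and stored)
#   cache.fetch('b') { 2 } # => 2
#   cache.fetch('c') { 3 } # => 3 ('a', the oldest entry, is evicted)
#   cache.fetch('a') { 4 } # => 4 (recomputed because 'a' was evicted)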
class Cache
DEFAULT_SIZE = 1000

def initialize(size: DEFAULT_SIZE)
@cache = {}
@cache_mutex = Mutex.new
@cache_size = size
end

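# Returns the cached value for +key+ when present; otherwise computes it by
# yielding the block, evicts the oldest entry if the cache is full, stores
# the new value, and returns it.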
def fetch(key)
@cache_mutex.synchronize do
return @cache[key] if @cache.key?(key)

result = yield
evict_if_needed
@cache[key] = result
result
end
end

# Re-sizes the cache, clearing it if it already holds more entries than the
# new size allows.
def configure(size: DEFAULT_SIZE)
  @cache_mutex.synchronize do
    @cache_size = size
    @cache.clear if @cache.size > size
  end
end

def clear
  @cache_mutex.synchronize { @cache.clear }
end

private

def evict_if_needed
  # Drop the oldest entry to stay within the configured size.
  @cache.shift if @cache.size >= @cache_size
end
end
end
end
end
@@ -0,0 +1,167 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

module OpenTelemetry
module Helpers
module QuerySummary
# Parser builds high-level SQL query summaries from tokenized input.
#
# Processes tokens to extract key operations and table names, creating
# summaries like "SELECT users" or "INSERT orders".
#
# @example
#   tokens = Tokenizer.tokenize("SELECT * FROM users")
#   Parser.build_summary_from_tokens(tokens) # => "SELECT users"
class Parser
# Two states: normal parsing vs. waiting for table names
PARSING_STATE = :parsing
EXPECT_COLLECTION_STATE = :expect_collection

# Operations that start a query; their table names follow a trigger keyword such as FROM or INTO
MAIN_OPERATIONS = %w[SELECT INSERT DELETE].freeze
# Operations whose table (or CTE) name immediately follows the keyword
COLLECTION_OPERATIONS = %w[WITH UPDATE].freeze
# Keywords that signal a table name is coming next
TRIGGER_COLLECTION = %w[FROM INTO JOIN IN].freeze
# DDL operations that create, modify, or remove database objects
TABLE_OPERATIONS = %w[CREATE ALTER DROP TRUNCATE].freeze
# Types of database objects that can follow a DDL operation
TABLE_OBJECTS = %w[TABLE INDEX PROCEDURE VIEW DATABASE].freeze
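
# A worked example: "SELECT * FROM users u" is walked as
#   SELECT -> emitted, stays in PARSING_STATE
#   *      -> ignored
#   FROM   -> emits nothing, switches to EXPECT_COLLECTION_STATE
#   users  -> emitted as a table name (the alias "u" is skipped),
#             returns to PARSING_STATE
# producing the summary "SELECT users".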

class << self
def build_summary_from_tokens(tokens)
summary_parts = []
state = PARSING_STATE
skip_until = 0 # Skip tokens we've already processed when looking ahead

tokens.each_with_index do |token, index|
next if index < skip_until

result = process_token(token, tokens, index, state)

summary_parts.concat(result[:parts])
state = result[:new_state]
skip_until = result[:next_index]
end

summary_parts.join(' ')
end

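# Each process_* helper returns a Hash describing how to continue:
#   :processed  - true if this helper handled the token
#   :parts      - summary fragments to append (may be empty)
#   :new_state  - the parser state for the next token
#   :next_index - the index of the next token to examine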
def process_token(token, tokens, index, state)
operation_result = process_main_operation(token, tokens, index, state)
return operation_result if operation_result[:processed]

collection_result = process_collection_token(token, tokens, index, state)
return collection_result if collection_result[:processed]

{ processed: false, parts: [], new_state: state, next_index: index + 1 }
end

def process_main_operation(token, tokens, index, current_state)
upcased_value = token.value.upcase

case upcased_value
when *MAIN_OPERATIONS
add_to_summary(token.value, PARSING_STATE, index + 1)
when *COLLECTION_OPERATIONS
add_to_summary(token.value, EXPECT_COLLECTION_STATE, index + 1)
when *TRIGGER_COLLECTION
expect_table_names_next(index + 1)
when *TABLE_OPERATIONS
handle_table_operation(token, tokens, index)
when 'UNION'
handle_union(token, tokens, index)
else
not_processed(current_state, index + 1)
end
end

def process_collection_token(token, tokens, index, state)
return not_processed(state, index + 1) unless state == EXPECT_COLLECTION_STATE

upcased_value = token.value.upcase

if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value))
process_table_name_and_alias(token, tokens, index)
elsif token.value == '(' || token.type == :operator
handle_collection_operator(token, state, index)
else
return_to_normal_parsing(token, index)
end
end

def process_table_name_and_alias(token, tokens, index)
# Look ahead to skip table aliases (e.g., "users u" or "users AS u")
skip_count = calculate_alias_skip(tokens, index)
# Check if there's a comma - if so, expect more table names in the list
new_state = tokens[index + 1 + skip_count]&.value == ',' ? EXPECT_COLLECTION_STATE : PARSING_STATE
skip_count += 1 if tokens[index + 1 + skip_count]&.value == ','

{ processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count }
end

def handle_collection_operator(_token, state, index)
  { processed: true, parts: [], new_state: state, next_index: index + 1 }
end

def return_to_normal_parsing(_token, index)
  { processed: true, parts: [], new_state: PARSING_STATE, next_index: index + 1 }
end

def identifier_like?(token)
%i[identifier quoted_identifier string].include?(token.type)
end

def can_be_table_name?(upcased_value)
# Object types that can appear after DDL operations
TABLE_OBJECTS.include?(upcased_value)
end

def calculate_alias_skip(tokens, index)
# Handle both "table AS alias" and "table alias" patterns
next_token = tokens[index + 1]
if next_token && next_token.value&.upcase == 'AS'
2
elsif next_token && next_token.type == :identifier
1
else
0
end
end

def add_to_summary(part, new_state, next_index)
{ processed: true, parts: [part], new_state: new_state, next_index: next_index }
end

def expect_table_names_next(next_index)
{ processed: true, parts: [], new_state: EXPECT_COLLECTION_STATE, next_index: next_index }
end

def not_processed(current_state, next_index)
{ processed: false, parts: [], new_state: current_state, next_index: next_index }
end

def handle_union(token, tokens, index)
next_token = tokens[index + 1]
if next_token && next_token.value&.upcase == 'ALL'
{ processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: PARSING_STATE, next_index: index + 2 }
else
add_to_summary(token.value, PARSING_STATE, index + 1)
end
end

def handle_table_operation(token, tokens, index)
# Combine DDL operations with object types: "CREATE TABLE", "DROP INDEX", etc.
next_token_obj = tokens[index + 1]
next_token = next_token_obj&.value&.upcase

case next_token
when *TABLE_OBJECTS
{ processed: true, parts: ["#{token.value} #{next_token}"], new_state: EXPECT_COLLECTION_STATE, next_index: index + 2 }
else
add_to_summary(token.value, PARSING_STATE, index + 1)
end
end
end
end
end
end
end
@@ -0,0 +1,62 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

require 'strscan'

module OpenTelemetry
module Helpers
module QuerySummary
# Tokenizer breaks down SQL queries into structured tokens for analysis.
#
# Parses SQL query strings into typed tokens (keywords, identifiers, operators, literals)
# for generating query summaries while filtering out sensitive data.
#
# @example
# tokens = Tokenizer.tokenize("SELECT * FROM users WHERE id = 1")
# # Returns tokens: [keyword: SELECT], [operator: *], [keyword: FROM], etc.
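#
# @example Full token stream for the same query (illustrative sketch)
#   Tokenizer.tokenize("SELECT * FROM users WHERE id = 1").map { |t| [t.type, t.value] }
#   # => [[:keyword, "SELECT"], [:operator, "*"], [:keyword, "FROM"],
#   #     [:identifier, "users"], [:identifier, "WHERE"], [:identifier, "id"],
#   #     [:operator, "="], [:numeric, "1"]]
#   # Note: WHERE is not in the keyword pattern, so it is typed :identifier.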
class Tokenizer
# Token holds the type (e.g., :keyword) and value (e.g., "SELECT")
Token = Struct.new(:type, :value)

# The order of token matching is important for correct parsing,
# as more specific patterns should be matched before more general ones.
TOKEN_REGEX = {
whitespace: /\s+/,
comment: %r{--[^\r\n]*|\/\*.*?\*\/}m,
numeric: /[+-]?(?:0x[0-9a-fA-F]+|\d+\.?\d*(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?)/,
string: /'(?:''|[^'\r\n])*'?/,
quoted_identifier: /"(?:""|[^"\r\n])*"|`(?:``|[^`\r\n])*`|\[(?:[^\]\r\n])*\]/,
keyword: /\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|INTO|JOIN|CREATE|ALTER|DROP|TRUNCATE|WITH|UNION|TABLE|INDEX|PROCEDURE|VIEW|DATABASE)\b/i,
identifier: /[a-zA-Z_][a-zA-Z0-9_.]*/,
operator: %r{<=|>=|<>|!=|[=<>+\-*\/%,;()!?]}
}.freeze

EXCLUDED_TYPES = %i[whitespace comment].freeze

class << self
def tokenize(query)
scanner = StringScanner.new(query)
tokens = []

scan_next_token(scanner, tokens) until scanner.eos?

tokens
end

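# Tries each pattern in TOKEN_REGEX order at the current scan position; the
# first match wins. Whitespace and comments are consumed but not emitted.
# If nothing matches, one character is consumed so the scan always advances.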
def scan_next_token(scanner, tokens)
matched = TOKEN_REGEX.any? do |type, regex|
next unless (value = scanner.scan(regex))

tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type)
true
end
scanner.getch unless matched
end
end
end
end
end
end