@@ -0,0 +1,42 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

require_relative 'query_summary/tokenizer'
require_relative 'query_summary/cache'
require_relative 'query_summary/parser'

module OpenTelemetry
module Helpers
# QuerySummary generates high-level summaries of SQL queries, consisting of
# the key operations and the table names they reference.
#
# @example
#   QuerySummary.generate_summary("SELECT * FROM users WHERE id = 1")
#   # => "SELECT users"
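#
# @example Configuring the cache size (illustrative sketch)
#   QuerySummary.configure_cache(size: 500)
#   QuerySummary.generate_summary("INSERT INTO orders (id) VALUES (1)")
#   # => "INSERT orders"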
module QuerySummary
class << self
def configure_cache(size: Cache::DEFAULT_SIZE)
cache_instance.configure(size: size)
end

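# Returns a short summary such as "SELECT users", caching results for
# identical queries. Falls back to 'UNKNOWN' if tokenizing or parsing raises.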
def generate_summary(query)
cache_instance.fetch(query) do
tokens = Tokenizer.tokenize(query)
Parser.build_summary_from_tokens(tokens)
end
rescue StandardError
'UNKNOWN'
end

private

def cache_instance
@cache_instance ||= Cache.new
end
end
end
end
end
@@ -0,0 +1,55 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

module OpenTelemetry
module Helpers
module QuerySummary
# Cache provides a thread-safe, size-bounded cache for query summaries.
#
# Stores generated query summaries to avoid reprocessing identical queries,
# evicting the oldest entry once the configured size is reached.
# Uses mutex synchronization for thread safety.
#
# @example
# cache = Cache.new
# cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users"
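#
# @example Eviction once the configured size is exceeded (illustrative sketch)
#   cache = Cache.new(size: 2)
#   cache.fetch('a') { 1 } # => 1 (computed and stored)
#   cache.fetch('b') { 2 } # => 2
#   cache.fetch('c') { 3 } # => 3 ('a', the oldest entry, is evicted)
#   cache.fetch('a') { 4 } # => 4 (recomputed because 'a' was evicted)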
class Cache
DEFAULT_SIZE = 1000

def initialize(size: DEFAULT_SIZE)
@cache = {}
@cache_mutex = Mutex.new
@cache_size = size
end

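# Returns the cached value for +key+ when present; otherwise computes it by
# yielding the block, evicts the oldest entry if the cache is full, stores
# the new value, and returns it.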
def fetch(key)
@cache_mutex.synchronize do
return @cache[key] if @cache.key?(key)

result = yield
evict_if_needed
@cache[key] = result
result
end
end

# Re-sizes the cache, clearing it if it already holds more entries than the
# new size allows.
def configure(size: DEFAULT_SIZE)
  @cache_mutex.synchronize do
    @cache_size = size
    @cache.clear if @cache.size > size
  end
end

def clear
  @cache_mutex.synchronize { @cache.clear }
end

private

def evict_if_needed
  # Drop the oldest entry to stay within the configured size.
  @cache.shift if @cache.size >= @cache_size
end
end
end
end
end
@@ -0,0 +1,167 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

module OpenTelemetry
module Helpers
module QuerySummary
# Parser builds high-level SQL query summaries from tokenized input.
#
# Processes tokens to extract key operations and table names, creating
# summaries like "SELECT users" or "INSERT orders".
#
# @example
#   tokens = Tokenizer.tokenize("SELECT * FROM users")
#   Parser.build_summary_from_tokens(tokens) # => "SELECT users"
class Parser
# Two states: normal parsing vs. waiting for table names
PARSING_STATE = :parsing
EXPECT_COLLECTION_STATE = :expect_collection

# Operations that start a query; their table names follow a trigger keyword such as FROM or INTO
MAIN_OPERATIONS = %w[SELECT INSERT DELETE].freeze
# Operations whose table (or CTE) name immediately follows the keyword
COLLECTION_OPERATIONS = %w[WITH UPDATE].freeze
# Keywords that signal a table name is coming next
TRIGGER_COLLECTION = %w[FROM INTO JOIN IN].freeze
# DDL operations that create, modify, or remove database objects
TABLE_OPERATIONS = %w[CREATE ALTER DROP TRUNCATE].freeze
# Types of database objects that can follow a DDL operation
TABLE_OBJECTS = %w[TABLE INDEX PROCEDURE VIEW DATABASE].freeze
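
# A worked example: "SELECT * FROM users u" is walked as
#   SELECT -> emitted, stays in PARSING_STATE
#   *      -> ignored
#   FROM   -> emits nothing, switches to EXPECT_COLLECTION_STATE
#   users  -> emitted as a table name (the alias "u" is skipped),
#             returns to PARSING_STATE
# producing the summary "SELECT users".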

class << self
def build_summary_from_tokens(tokens)
summary_parts = []
state = PARSING_STATE
skip_until = 0 # Skip tokens we've already processed when looking ahead

tokens.each_with_index do |token, index|
next if index < skip_until

result = process_token(token, tokens, index, state)

summary_parts.concat(result[:parts])
state = result[:new_state]
skip_until = result[:next_index]
end

summary_parts.join(' ')
end

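# Each process_* helper returns a Hash describing how to continue:
#   :processed  - true if this helper handled the token
#   :parts      - summary fragments to append (may be empty)
#   :new_state  - the parser state for the next token
#   :next_index - the index of the next token to examine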
def process_token(token, tokens, index, state)
operation_result = process_main_operation(token, tokens, index, state)
return operation_result if operation_result[:processed]

collection_result = process_collection_token(token, tokens, index, state)
return collection_result if collection_result[:processed]

{ processed: false, parts: [], new_state: state, next_index: index + 1 }
end

def process_main_operation(token, tokens, index, current_state)
upcased_value = token.value.upcase

case upcased_value
when *MAIN_OPERATIONS
add_to_summary(token.value, PARSING_STATE, index + 1)
when *COLLECTION_OPERATIONS
add_to_summary(token.value, EXPECT_COLLECTION_STATE, index + 1)
when *TRIGGER_COLLECTION
expect_table_names_next(index + 1)
when *TABLE_OPERATIONS
handle_table_operation(token, tokens, index)
when 'UNION'
handle_union(token, tokens, index)
else
not_processed(current_state, index + 1)
end
end

def process_collection_token(token, tokens, index, state)
return not_processed(state, index + 1) unless state == EXPECT_COLLECTION_STATE

upcased_value = token.value.upcase

if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value))
process_table_name_and_alias(token, tokens, index)
elsif token.value == '(' || token.type == :operator
handle_collection_operator(token, state, index)
else
return_to_normal_parsing(token, index)
end
end

def process_table_name_and_alias(token, tokens, index)
# Look ahead to skip table aliases (e.g., "users u" or "users AS u")
skip_count = calculate_alias_skip(tokens, index)
# Check if there's a comma - if so, expect more table names in the list
new_state = tokens[index + 1 + skip_count]&.value == ',' ? EXPECT_COLLECTION_STATE : PARSING_STATE
skip_count += 1 if tokens[index + 1 + skip_count]&.value == ','

{ processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count }
end

def handle_collection_operator(_token, state, index)
  { processed: true, parts: [], new_state: state, next_index: index + 1 }
end

def return_to_normal_parsing(_token, index)
  { processed: true, parts: [], new_state: PARSING_STATE, next_index: index + 1 }
end

def identifier_like?(token)
%i[identifier quoted_identifier string].include?(token.type)
end

def can_be_table_name?(upcased_value)
# Object types that can appear after DDL operations
TABLE_OBJECTS.include?(upcased_value)
end

def calculate_alias_skip(tokens, index)
# Handle both "table AS alias" and "table alias" patterns
next_token = tokens[index + 1]
if next_token && next_token.value&.upcase == 'AS'
2
elsif next_token && next_token.type == :identifier
1
else
0
end
end

def add_to_summary(part, new_state, next_index)
{ processed: true, parts: [part], new_state: new_state, next_index: next_index }
end

def expect_table_names_next(next_index)
{ processed: true, parts: [], new_state: EXPECT_COLLECTION_STATE, next_index: next_index }
end

def not_processed(current_state, next_index)
{ processed: false, parts: [], new_state: current_state, next_index: next_index }
end

def handle_union(token, tokens, index)
next_token = tokens[index + 1]
if next_token && next_token.value&.upcase == 'ALL'
{ processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: PARSING_STATE, next_index: index + 2 }
else
add_to_summary(token.value, PARSING_STATE, index + 1)
end
end

def handle_table_operation(token, tokens, index)
# Combine DDL operations with object types: "CREATE TABLE", "DROP INDEX", etc.
next_token_obj = tokens[index + 1]
next_token = next_token_obj&.value&.upcase

case next_token
when *TABLE_OBJECTS
{ processed: true, parts: ["#{token.value} #{next_token}"], new_state: EXPECT_COLLECTION_STATE, next_index: index + 2 }
else
add_to_summary(token.value, PARSING_STATE, index + 1)
end
end
end
end
end
end
end
@@ -0,0 +1,62 @@
# frozen_string_literal: true

# Copyright The OpenTelemetry Authors
#
# SPDX-License-Identifier: Apache-2.0

require 'strscan'

module OpenTelemetry
module Helpers
module QuerySummary
# Tokenizer breaks down SQL queries into structured tokens for analysis.
#
# Parses SQL query strings into typed tokens (keywords, identifiers, operators, literals)
# for generating query summaries while filtering out sensitive data.
#
# @example
# tokens = Tokenizer.tokenize("SELECT * FROM users WHERE id = 1")
# # Returns tokens: [keyword: SELECT], [operator: *], [keyword: FROM], etc.
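#
# @example Full token stream for the same query (illustrative sketch)
#   Tokenizer.tokenize("SELECT * FROM users WHERE id = 1").map { |t| [t.type, t.value] }
#   # => [[:keyword, "SELECT"], [:operator, "*"], [:keyword, "FROM"],
#   #     [:identifier, "users"], [:identifier, "WHERE"], [:identifier, "id"],
#   #     [:operator, "="], [:numeric, "1"]]
#   # Note: WHERE is not in the keyword pattern, so it is typed :identifier.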
class Tokenizer
# Token holds the type (e.g., :keyword) and value (e.g., "SELECT")
Token = Struct.new(:type, :value)

# The order of token matching is important for correct parsing,
# as more specific patterns should be matched before more general ones.
TOKEN_REGEX = {
whitespace: /\s+/,
comment: %r{--[^\r\n]*|\/\*.*?\*\/}m,
numeric: /[+-]?(?:0x[0-9a-fA-F]+|\d+\.?\d*(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?)/,
string: /'(?:''|[^'\r\n])*'?/,
quoted_identifier: /"(?:""|[^"\r\n])*"|`(?:``|[^`\r\n])*`|\[(?:[^\]\r\n])*\]/,
keyword: /\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|INTO|JOIN|CREATE|ALTER|DROP|TRUNCATE|WITH|UNION|TABLE|INDEX|PROCEDURE|VIEW|DATABASE)\b/i,
identifier: /[a-zA-Z_][a-zA-Z0-9_.]*/,
operator: %r{<=|>=|<>|!=|[=<>+\-*\/%,;()!?]}
}.freeze

EXCLUDED_TYPES = %i[whitespace comment].freeze

class << self
def tokenize(query)
scanner = StringScanner.new(query)
tokens = []

scan_next_token(scanner, tokens) until scanner.eos?

tokens
end

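# Tries each pattern in TOKEN_REGEX order at the current scan position; the
# first match wins. Whitespace and comments are consumed but not emitted.
# If nothing matches, one character is consumed so the scan always advances.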
def scan_next_token(scanner, tokens)
matched = TOKEN_REGEX.any? do |type, regex|
next unless (value = scanner.scan(regex))

tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type)
true
end
scanner.getch unless matched
end
end
end
end
end
end