From a30245994c7d809c972451f42ab9c526203aad53 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Wed, 30 Jul 2025 12:25:00 +0000 Subject: [PATCH 01/14] analyze_k_impact.py script --- analyze_k_impact.py | 340 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 analyze_k_impact.py diff --git a/analyze_k_impact.py b/analyze_k_impact.py new file mode 100644 index 00000000..84997bd9 --- /dev/null +++ b/analyze_k_impact.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +""" +Script to analyze K value impact on performance for same effective K configurations. + +This script generates graphs showing how different K values affect QPS performance +when they result in the same effective K per shard. +""" + +import json +import os +import glob +import matplotlib.pyplot as plt +from typing import Dict, List, Optional +import math + +# Configuration +RESULTS_DIR = "/home/ubuntu/vector-db-benchmark-1/results/final" +OUTPUT_DIR = "./graphs_k_impact" + +def find_summary_files() -> List[str]: + """Find all summary JSON files in the results directory.""" + pattern = os.path.join(RESULTS_DIR, "*-summary.json") + files = glob.glob(pattern) + print(f"Found {len(files)} summary files:") + for file in files: + print(f" - {os.path.basename(file)}") + return files + +def parse_filename(filename: str) -> Dict[str, str]: + """ + Parse filename to extract database configuration info. + Handles both old and new formats with workers. + """ + basename = os.path.basename(filename) + + # Check if it's the new format with workers + if "workers_" in basename: + # New format: ...k_ratio_1.0-workers_8-summary.json + workers_part = basename.split("workers_")[1] # "8-summary.json" + workers = workers_part.split("-")[0] # "8" + else: + # Old format: ...k_ratio_1.0-summary.json + workers = "8" # Default for old files + + # Remove -summary.json suffix + name_parts = basename.replace("-summary.json", "").split("-") + + # Extract key information + parsed = { + "engine": name_parts[0] if len(name_parts) > 0 else "unknown", + "algorithm": name_parts[1] if len(name_parts) > 1 else "unknown", + "filename": basename, + "full_name": basename.replace("-summary.json", ""), + "workers": int(workers) + } + + return parsed + +def load_benchmark_data(file_path: str) -> Dict: + """Load and parse a benchmark summary JSON file.""" + try: + with open(file_path, 'r') as f: + data = json.load(f) + return data + except Exception as e: + print(f"Error loading {file_path}: {e}") + return {} + +def calculate_effective_k(k_ratio, shard_count, k): + """Calculate the effective K per shard.""" + num_shards = shard_count + k_ratio_config = k_ratio + + k_min = math.ceil(k / num_shards) + k_per_shard_req = math.ceil(k * k_ratio_config) + effective_k = max(k_min, k_per_shard_req) + + return effective_k + +def extract_summary_data(data: Dict, filename: str) -> Optional[Dict]: + """Extract summary data from a benchmark JSON file.""" + if 'precision_summary' not in data or 'precision' not in data: + print(f"Warning: No precision_summary or precision found in {filename}") + return None + + # Get k_ratio and shard_count from the precision section + precision_data = data['precision'] + if not precision_data: + print(f"Warning: Empty precision section in {filename}") + return None + + # Get the first precision entry + first_precision_key = list(precision_data.keys())[0] + config = precision_data[first_precision_key].get('config', {}) + + k_ratio = config.get('k_ratio') + shard_count = config.get('shard_count') + workers = 
parse_filename(filename)['workers'] + + # Get K value from search section + if 'search' not in data or not data['search']: + return None + + first_search_key = list(data['search'].keys())[0] + search_params = data['search'][first_search_key].get('params', {}) + k_value = search_params.get('top') + + if k_ratio is None or shard_count is None or k_value is None: + print(f"Warning: Missing required data in {filename}") + return None + + # Calculate effective K + effective_k = calculate_effective_k(k_ratio, shard_count, k_value) + + # Extract performance data from precision_summary + precision_summary = data['precision_summary'] + best_precision = max(precision_summary.keys(), key=float) + qps = precision_summary[best_precision].get('qps', 0) + + return { + 'filename': filename, + 'k_ratio': k_ratio, + 'shard_count': shard_count, + 'workers': workers, + 'k_value': k_value, + 'effective_k': effective_k, + 'qps': qps, + 'precision': float(best_precision) + } + +def organize_data_by_effective_k_and_shards(summaries: List[Dict]) -> Dict[tuple, List[Dict]]: + """Organize summary data by (effective_k, shard_count, workers) combination.""" + by_config = {} + + for summary in summaries: + effective_k = summary['effective_k'] + shard_count = summary['shard_count'] + workers = summary['workers'] + key = (effective_k, shard_count, workers) + + if key not in by_config: + by_config[key] = [] + by_config[key].append(summary) + + return by_config + +def create_k_impact_graph(effective_k: int, shard_count: int, workers: int, summaries: List[Dict], output_dir: str): + """Create a graph showing K value impact on QPS for same effective K configuration.""" + if len(summaries) < 2: + print(f"Skipping graph for effective_k={effective_k}, shards={shard_count}, workers={workers} - need at least 2 K values") + return + + # Sort by K value for proper line plotting + summaries = sorted(summaries, key=lambda x: x['k_value']) + + # Extract data for plotting + k_values = [s['k_value'] for s in summaries] + qps_values = [s['qps'] for s in summaries] + + # Create the plot + fig, ax = plt.subplots(figsize=(10, 6)) + + # Plot QPS vs K values + color = 'tab:blue' + ax.set_xlabel('K Value') + ax.set_ylabel('QPS (Queries Per Second)', color=color) + line = ax.plot(k_values, qps_values, 'o-', color=color, linewidth=2, markersize=8) + ax.tick_params(axis='y', labelcolor=color) + ax.grid(True, alpha=0.3) + + # Add labels for QPS values + for i, (x, y) in enumerate(zip(k_values, qps_values)): + offset_y = 8 if i % 2 == 0 else 12 + ax.annotate(f'{y:.1f}', (x, y), textcoords="offset points", xytext=(0, offset_y), ha='center', + fontsize=9, color=color, weight='bold', + bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color)) + + # Add title + plt.title(f'QPS vs K Value - EffectiveK={effective_k}, {shard_count} Shards, {workers} Workers', + fontsize=14, fontweight='bold') + + # Set x-axis to show all K values + ax.set_xticks(k_values) + ax.set_xticklabels([str(k) for k in k_values]) + + # Improve layout + plt.tight_layout() + + # Save the graph + filename = f"effective_k_{effective_k}_shards_{shard_count}_workers_{workers}_k_impact.png" + filepath = os.path.join(output_dir, filename) + plt.savefig(filepath, dpi=300, bbox_inches='tight') + plt.close() + + print(f" Graph saved: {filename}") + print(f" K values: {k_values}") + print(f" QPS values: {[f'{qps:.1f}' for qps in qps_values]}") + +def create_combined_k_impact_graph(shard_count: int, workers: int, configs_data: Dict[int, List[Dict]], output_dir: str): + 
"""Create a combined graph with multiple lines, each representing an effective K value.""" + + # Filter configs that have multiple K values + valid_configs = {eff_k: summaries for eff_k, summaries in configs_data.items() if len(summaries) >= 2} + + if not valid_configs: + print(f"Skipping combined graph for {shard_count} shards, {workers} workers - no configs with multiple K values") + return + + # Create the plot + fig, ax = plt.subplots(figsize=(12, 8)) + + # Color palette for different effective K lines + colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray'] + + # Plot each effective K as a separate line + for i, (effective_k, summaries) in enumerate(sorted(valid_configs.items())): + # Sort by K value + summaries = sorted(summaries, key=lambda x: x['k_value']) + + k_values = [s['k_value'] for s in summaries] + qps_values = [s['qps'] for s in summaries] + + color = colors[i % len(colors)] + + # Plot line + ax.plot(k_values, qps_values, 'o-', color=color, linewidth=2, markersize=6, + label=f'Effective K = {effective_k}') + + # Add value labels + for j, (x, y) in enumerate(zip(k_values, qps_values)): + offset_y = 8 if j % 2 == 0 else 12 + ax.annotate(f'{y:.1f}', (x, y), textcoords="offset points", xytext=(0, offset_y), + ha='center', fontsize=8, color=color, weight='bold', + bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color)) + + # Formatting + ax.set_xlabel('K Value') + ax.set_ylabel('QPS (Queries Per Second)') + ax.set_title(f'QPS vs K Value by Effective K - {shard_count} Shards, {workers} Workers', + fontsize=14, fontweight='bold') + ax.grid(True, alpha=0.3) + ax.legend(loc='best') + + # Set x-axis to show all K values + all_k_values = sorted(set(s['k_value'] for summaries in valid_configs.values() for s in summaries)) + ax.set_xticks(all_k_values) + ax.set_xticklabels([str(k) for k in all_k_values]) + + # Improve layout + plt.tight_layout() + + # Save the graph + filename = f"combined_k_impact_shards_{shard_count}_workers_{workers}.png" + filepath = os.path.join(output_dir, filename) + plt.savefig(filepath, dpi=300, bbox_inches='tight') + plt.close() + + print(f" Combined graph saved: {filename}") + print(f" Effective K values: {sorted(valid_configs.keys())}") + print(f" K values range: {min(all_k_values)} - {max(all_k_values)}") + +def main(): + """Main function to orchestrate the K impact analysis.""" + print("Starting K Impact Analysis...") + + # Create output directory if it doesn't exist + os.makedirs(OUTPUT_DIR, exist_ok=True) + + print(f"Looking for result files in: {RESULTS_DIR}") + print(f"Graphs will be saved to: {OUTPUT_DIR}") + + # Find and process summary files + summary_files = find_summary_files() + + if not summary_files: + print("No summary files found! Please check the results directory.") + return + + # Process all files and extract summary data + all_summaries = [] + + for file_path in summary_files: + print(f"\nProcessing: {os.path.basename(file_path)}") + data = load_benchmark_data(file_path) + + if data: + summary = extract_summary_data(data, os.path.basename(file_path)) + if summary: + all_summaries.append(summary) + print(f" Extracted: K={summary['k_value']}, EffectiveK={summary['effective_k']}, Shards={summary['shard_count']}, Workers={summary['workers']}") + + print(f"\nTotal valid summaries found: {len(all_summaries)}") + + if not all_summaries: + print("No valid summary data found! 
Please check your result files.") + return + + # Organize by effective K and shard configuration + by_config = organize_data_by_effective_k_and_shards(all_summaries) + print(f"Data organized by configurations: {len(by_config)} unique configs") + + # Show summary of what we found + for (effective_k, shard_count, workers), summaries in sorted(by_config.items()): + k_values = sorted(set(s['k_value'] for s in summaries)) + print(f" EffectiveK={effective_k}, Shards={shard_count}, Workers={workers}: {len(summaries)} summaries") + print(f" K values: {k_values}") + + # Create individual graphs for each configuration + print(f"\nCreating individual K impact graphs...") + individual_graphs_created = 0 + + for (effective_k, shard_count, workers), summaries in sorted(by_config.items()): + print(f"\nProcessing EffectiveK={effective_k}, Shards={shard_count}, Workers={workers}:") + create_k_impact_graph(effective_k, shard_count, workers, summaries, OUTPUT_DIR) + individual_graphs_created += 1 + + # Create combined graphs by shard count and workers + print(f"\nCreating combined K impact graphs...") + combined_graphs_created = 0 + + # Group by (shard_count, workers) for combined graphs + by_shard_workers = {} + for (effective_k, shard_count, workers), summaries in by_config.items(): + key = (shard_count, workers) + if key not in by_shard_workers: + by_shard_workers[key] = {} + by_shard_workers[key][effective_k] = summaries + + for (shard_count, workers), configs_data in sorted(by_shard_workers.items()): + print(f"\nCreating combined graph for {shard_count} shards, {workers} workers:") + create_combined_k_impact_graph(shard_count, workers, configs_data, OUTPUT_DIR) + combined_graphs_created += 1 + + print(f"\nAnalysis complete!") + print(f"Created {individual_graphs_created} individual graphs") + print(f"Created {combined_graphs_created} combined graphs") + print(f"All graphs saved in: {OUTPUT_DIR}") + +if __name__ == "__main__": + main() From 60d1e7421e7da1d1b2cf57f8cba8274d1c16e2de Mon Sep 17 00:00:00 2001 From: meiravgri Date: Wed, 30 Jul 2025 13:32:55 +0000 Subject: [PATCH 02/14] more files --- analyze_shard_k_ratio.py | 342 ++++++++++++++++++++++++++++++ datasets/datasets.json | 5 +- db_to_port_and_host.md | 23 ++ engine/base_client/client.py | 16 +- engine/clients/redis/config.py | 6 + engine/clients/redis/configure.py | 3 + engine/clients/redis/search.py | 14 +- 7 files changed, 400 insertions(+), 9 deletions(-) create mode 100644 analyze_shard_k_ratio.py create mode 100644 db_to_port_and_host.md diff --git a/analyze_shard_k_ratio.py b/analyze_shard_k_ratio.py new file mode 100644 index 00000000..275f6639 --- /dev/null +++ b/analyze_shard_k_ratio.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +""" +Script to analyze shard_k_ratio feature impact on performance and recall. + +This script generates graphs showing the relationship between shard_k_ratio values +and performance/recall metrics for different database configurations. 
+""" + +import json +import os +import glob +import matplotlib.pyplot as plt +import numpy as np +from typing import Dict, List, Optional +from pathlib import Path +import math +# Configuration +RESULTS_DIR = "/home/ubuntu/vector-db-benchmark-1/results/final" +OUTPUT_DIR = "./graphs" + +def find_summary_files() -> List[str]: + """Find all summary JSON files in the results directory.""" + pattern = os.path.join(RESULTS_DIR, "*-summary.json") + files = glob.glob(pattern) + print(f"Found {len(files)} summary files:") + for file in files: + print(f" - {os.path.basename(file)}") + return files + +def get_wrokers_from_filename(filename: str) -> int: + """ + Parse filename to extract workers. + Expected format: *workers-*-summary.json + """ + basename = os.path.basename(filename) + + # Check if it's the new format with workers + if "workers_" in basename: + # New format: ...k_ratio_1.0-workers_8-summary.json + workers_part = basename.split("workers_")[1] # "8-summary.json" + workers = workers_part.split("-")[0] # "8" + else: + # Old format: ...k_ratio_1.0-summary.json + workers = "8" # or "unknown" - your choice for default + + # Remove -summary.json suffix + name_parts = basename.replace("-summary.json", "").split("-") + + return int(workers) + +def load_benchmark_data(file_path: str) -> Dict: + """Load and parse a benchmark summary JSON file.""" + try: + with open(file_path, 'r') as f: + data = json.load(f) + return data + except Exception as e: + print(f"Error loading {file_path}: {e}") + return {} + +def calculate_effective_k(k_ratio, shard_count, k): + num_shards = shard_count + k_ratio_config = k_ratio # The configured k_ratio + + k_min = math.ceil(k / num_shards) + k_per_shard_req = math.ceil(k * k_ratio_config) + effective_k = max(k_min, k_per_shard_req) + + return effective_k + +def calculatr_actual_k_ratio(k_ratio, shard_count, k): + effective_k = calculate_effective_k(k_ratio, shard_count, k) + k_ratio_actual = effective_k / k + + return k_ratio_actual + +def extract_summary_data(data: Dict, filename: str) -> Optional[Dict]: + """Extract summary data from a benchmark JSON file.""" + if 'precision_summary' not in data or 'precision' not in data: + print(f"Warning: No precision_summary or precision found in {filename}") + return None + + # Get k_ratio and shard_count from the precision section + precision_data = data['precision'] + if not precision_data: + print(f"Warning: Empty precision section in {filename}") + return None + + # Get the first precision entry (they should all have the same k_ratio and shard_count) + first_precision_key = list(precision_data.keys())[0] + config = precision_data[first_precision_key].get('config', {}) + + k_ratio = config.get('k_ratio', 'unknown') + shard_count = config.get('shard_count', 'unknown') + workers = get_wrokers_from_filename(filename) + + if k_ratio == 'unknown' or shard_count == 'unknown': + print(f"Warning: Missing k_ratio or shard_count in {filename}") + print(f" Found k_ratio: {k_ratio}, shard_count: {shard_count}") + return None + + # Get K value from the search section (since it's not in config) + k_value = 'unknown' + if 'search' in data and data['search']: + # Get the first search experiment to extract the 'top' value + first_search_key = list(data['search'].keys())[0] + search_params = data['search'][first_search_key].get('params', {}) + k_value = search_params.get('top', 'unknown') + + # Extract performance data from precision_summary + precision_summary = data['precision_summary'] + + # Get the best precision point (highest 
+def extract_summary_data(data: Dict, filename: str) -> Optional[Dict]:
+    """Extract summary data from a benchmark JSON file."""
+    if 'precision_summary' not in data or 'precision' not in data:
+        print(f"Warning: No precision_summary or precision found in {filename}")
+        return None
+
+    # Get k_ratio and shard_count from the precision section
+    precision_data = data['precision']
+    if not precision_data:
+        print(f"Warning: Empty precision section in {filename}")
+        return None
+
+    # Get the first precision entry (they should all have the same k_ratio and shard_count)
+    first_precision_key = list(precision_data.keys())[0]
+    config = precision_data[first_precision_key].get('config', {})
+
+    k_ratio = config.get('k_ratio', 'unknown')
+    shard_count = config.get('shard_count', 'unknown')
+    workers = get_workers_from_filename(filename)
+
+    if k_ratio == 'unknown' or shard_count == 'unknown':
+        print(f"Warning: Missing k_ratio or shard_count in {filename}")
+        print(f"  Found k_ratio: {k_ratio}, shard_count: {shard_count}")
+        return None
+
+    # Get K value from the search section (since it's not in config)
+    k_value = 'unknown'
+    if 'search' in data and data['search']:
+        # Get the first search experiment to extract the 'top' value
+        first_search_key = list(data['search'].keys())[0]
+        search_params = data['search'][first_search_key].get('params', {})
+        k_value = search_params.get('top', 'unknown')
+
+    # Extract performance data from precision_summary
+    precision_summary = data['precision_summary']
+
+    # Get the best precision point (highest precision available)
+    best_precision = max(precision_summary.keys(), key=float)
+    best_data = precision_summary[best_precision]
+
+    return {
+        'filename': filename,
+        'k_ratio': calculate_actual_k_ratio(k_ratio, shard_count, k_value),
+        'shard_count': shard_count,
+        'workers': workers,
+        'k_value': k_value,
+        'precision': float(best_precision),
+        'qps': best_data.get('qps', 0),
+        'p50': best_data.get('p50', 0),
+        'p95': best_data.get('p95', 0),
+    }
+
+def organize_data_by_k_and_shards(summaries: List[Dict]) -> Dict[tuple, List[Dict]]:
+    """Organize summary data by (k_value, shard_count) combination."""
+    by_k_and_shards = {}
+
+    for summary in summaries:
+        k_value = summary['k_value']
+        shard_count = summary['shard_count']
+        workers = summary['workers']
+        key = (k_value, shard_count, workers)
+
+        if key not in by_k_and_shards:
+            by_k_and_shards[key] = []
+        by_k_and_shards[key].append(summary)
+
+    return by_k_and_shards
+
+def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output_dir: str):
+    """Create a graph for a specific shard count showing ratio vs performance/recall."""
+    if not summaries:
+        print(f"No data for shard count {shard_count}")
+        return
+
+    # Sort by k_ratio for proper line plotting
+    summaries = sorted(summaries, key=lambda x: x['k_ratio'])
+
+    # Extract data for plotting
+    effective_k_values = [calculate_effective_k(s['k_ratio'], shard_count, s['k_value']) for s in summaries]
+    ratios = [s['k_ratio'] for s in summaries]
+    qps_values = [s['qps'] for s in summaries]
+    precision_values = [s['precision'] for s in summaries]
+
+    # Get workers (should be the same for all experiments in this shard count)
+    workers = set(s['workers'] for s in summaries)
+    workers = list(workers)[0] if len(workers) == 1 else f"Workers={sorted(workers)}"
+
+    # Get K value (should be the same for all experiments in this shard count)
+    k_values = set(s.get('k_value', 'unknown') for s in summaries)
+    k_value = list(k_values)[0] if len(k_values) == 1 else f"K={sorted(k_values)}"
+
+    # Create the plot
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plot QPS (performance) on left y-axis
+    color1 = 'tab:blue'
+    ax1.set_xlabel('Effective K (Ratio)')
+    ax1.set_ylabel('QPS (Queries Per Second)', color=color1)
+    line1 = ax1.plot(effective_k_values, qps_values, 'o-', color=color1, label='QPS', linewidth=2, markersize=6)
+    ax1.tick_params(axis='y', labelcolor=color1)
+    ax1.grid(True, alpha=0.3)
+
+    # Create consistent x-axis with fixed intervals from lowest ratio to 1.0
+    min_ratio = min(ratios)
+    # Create x-axis values from min_ratio to 1.0 in 0.1 increments
+    # (the epsilon keeps 1.0 in range despite float accumulation drift)
+    x_axis_ratios = []
+    current_ratio = min_ratio
+    while current_ratio <= 1.0 + 1e-9:
+        x_axis_ratios.append(round(current_ratio, 1))
+        current_ratio += 0.1
+
+    # Calculate corresponding effective K values for x-axis
+    x_axis_effective_k = [calculate_effective_k(ratio, shard_count, k_value) for ratio in x_axis_ratios]
+
+    # Create x-axis labels
+    x_labels = [f'{int(eff_k)}\n({ratio:.1f})' for eff_k, ratio in zip(x_axis_effective_k, x_axis_ratios)]
+
+    # Set consistent x-axis
+    ax1.set_xticks(x_axis_effective_k)
+    ax1.set_xticklabels(x_labels, rotation=0, ha='center')
+    ax1.set_xlim(min(x_axis_effective_k) * 0.95, max(x_axis_effective_k) * 1.05)
+
+    # Create second y-axis for precision (recall)
+    ax2 = ax1.twinx()
+    color2 = 'tab:red'
+    ax2.set_ylabel('Precision (Recall)', color=color2)
+    line2 = ax2.plot(effective_k_values, precision_values, 's-', color=color2, label='Precision', linewidth=2, markersize=6)
+    ax2.tick_params(axis='y', 
labelcolor=color2) + + # Format precision y-axis to show max 3 decimal places + ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.3f}')) + + # Add labels for QPS values (blue line) - positioned closer to points + for i, (x, y) in enumerate(zip(effective_k_values, qps_values)): + # Alternate positioning: odd indices go slightly higher to avoid overlap + offset_y = 8 if i % 2 == 0 else 12 + ax1.annotate(f'{y:.1f}', (x, y), textcoords="offset points", xytext=(0, offset_y), ha='center', + fontsize=8, color=color1, weight='bold', + bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color1)) + + # Add labels for Precision values (red line) - positioned closer below points + for i, (x, y) in enumerate(zip(effective_k_values, precision_values)): + # Alternate positioning: odd indices go slightly lower to avoid overlap + offset_y = -12 if i % 2 == 0 else -16 + ax2.annotate(f'{y:.2f}', (x, y), textcoords="offset points", xytext=(0, offset_y), ha='center', + fontsize=8, color=color2, weight='bold', + bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color2)) + + # Add selective improvement annotations (Option C) + if len(summaries) >= 2: + # Find the point with ratio 1.0 (baseline) and lowest ratio + baseline_idx = None + lowest_ratio_idx = 0 + + for i, ratio in enumerate(ratios): + if abs(ratio - 1.0) < 0.01: # ratio ≈ 1.0 + baseline_idx = i + if ratios[i] < ratios[lowest_ratio_idx]: + lowest_ratio_idx = i + + # Add annotation if we have both baseline and lowest ratio points, and they're different + if baseline_idx is not None and lowest_ratio_idx != baseline_idx: + baseline_qps = qps_values[baseline_idx] + lowest_qps = qps_values[lowest_ratio_idx] + baseline_precision = precision_values[baseline_idx] + lowest_precision = precision_values[lowest_ratio_idx] + + qps_improvement = ((lowest_qps - baseline_qps) / baseline_qps) * 100 + precision_change = ((lowest_precision - baseline_precision) / baseline_precision) * 100 + + # Option A: Place annotation outside graph area, bottom-left corner + lowest_ratio = ratios[lowest_ratio_idx] + annotation_text = f"Ratio: {lowest_ratio:.1f} vs 1.0\n{qps_improvement:+.0f}% QPS\n{precision_change:+.1f}% Precision" + + # Position outside the plot area in bottom-left, below x-axis labels and legend + ax1.text(0.02, -0.13, annotation_text, transform=ax1.transAxes, + fontsize=12, ha='left', va='top', + bbox=dict(boxstyle="round,pad=0.4", facecolor='lightyellow', alpha=0.9, edgecolor='orange')) + + # Add title and legend + plt.title(f'Performance vs EffectiveK - {shard_count} Shards (K={k_value})', fontsize=14, fontweight='bold') + + # Create proper legend below x-axis, centered in one line + lines = line1 + line2 + labels = [l.get_label() for l in lines] + ax1.legend(lines, labels, bbox_to_anchor=(0.5, -0.15), loc='upper center', + ncol=2, frameon=False, fontsize=10) + + # Adjust plot margins to prevent label truncation + plt.subplots_adjust(top=0.85, bottom=0.15) + + # Improve layout + plt.tight_layout() + + # Save the graph + filename = f"shard_count_{shard_count}_k_{k_value}_workers_{workers}_performance.png" + filepath = os.path.join(output_dir, filename) + plt.savefig(filepath, dpi=300, bbox_inches='tight') + plt.close() + + print(f" Graph saved: {filename}") + + # Print data summary + print(f" Data points: {len(summaries)}") + print(f" Effective K values: {effective_k_values}") + print(f" QPS range: {min(qps_values):.1f} - {max(qps_values):.1f}") + print(f" Precision range: 
{min(precision_values):.3f} - {max(precision_values):.3f}") + +def main(): + """Main function to orchestrate the analysis.""" + print("Starting shard_k_ratio analysis...") + + # Create output directory if it doesn't exist + os.makedirs(OUTPUT_DIR, exist_ok=True) + + print(f"Looking for result files in: {RESULTS_DIR}") + print(f"Graphs will be saved to: {OUTPUT_DIR}") + + # Find and parse summary files + summary_files = find_summary_files() + + if not summary_files: + print("No summary files found! Please check the results directory.") + return + + # Process all files and extract summary data + all_summaries = [] + + for file_path in summary_files: + print(f"\nProcessing: {os.path.basename(file_path)}") + data = load_benchmark_data(file_path) + + if data: + summary = extract_summary_data(data, os.path.basename(file_path)) + if summary: + all_summaries.append(summary) + print(f" Extracted summary: k_ratio={summary['k_ratio']}, shard_count={summary['shard_count']}, precision={summary['precision']:.3f}, workers={summary['workers']}") + + print(f"\nTotal valid summaries found: {len(all_summaries)}") + + if not all_summaries: + print("No valid summary data found! Please check your result files.") + return + + # Organize by shard count + by_k_and_shards = organize_data_by_k_and_shards(all_summaries) + print(f"Data organized by shard counts: {sorted(by_k_and_shards.keys())}") + + # Show summary of what we found + for (k_value, shard_count, workers) in sorted(by_k_and_shards.keys()): + summaries = by_k_and_shards[(k_value, shard_count, workers)] + ratios = sorted(set(s['k_ratio'] for s in summaries)) + print(f" K={k_value}, Shard count {shard_count}: {len(summaries)} summaries") + print(f" K ratios found: {ratios}") + + # Create graphs for each shard count + for (k_value, shard_count, workers) in sorted(by_k_and_shards.keys()): + summaries = by_k_and_shards[(k_value, shard_count, workers)] + print(f"\nCreating graph for K={k_value}, {shard_count} shards, {workers} workers:") + create_graph_for_shard_count(shard_count, summaries, OUTPUT_DIR) + + print(f"\nAll graphs saved to: {OUTPUT_DIR}") + print("Analysis complete!") + +if __name__ == "__main__": + main() diff --git a/datasets/datasets.json b/datasets/datasets.json index 5493502e..7b5fbabe 100644 --- a/datasets/datasets.json +++ b/datasets/datasets.json @@ -1048,7 +1048,8 @@ "schema": { "update_date_ts": "int", "labels": "keyword", - "submitter": "keyword" + "submitter": "keyword", + "abstract": "text" }, "vector_count": 2205995, "description": "Academic paper embeddings" @@ -1297,4 +1298,4 @@ "vector_count": 100, "description": "Synthetic data" } -] \ No newline at end of file +] diff --git a/db_to_port_and_host.md b/db_to_port_and_host.md new file mode 100644 index 00000000..c1e4a72a --- /dev/null +++ b/db_to_port_and_host.md @@ -0,0 +1,23 @@ +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=14941 host=54.78.191.248 +WORKERS=8 SHARD_COUNT=5 REDIS_PORT=12041 host=18.203.186.188 +SHARD_COUNT=2 REDIS_PORT=12558 host=54.78.191.248 +SHARD_COUNT=1 REDIS_PORT=16556 host=18.203.186.188 +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 host=18.203.186.188 # WITH FILTERS +WORKERS=8 SHARD_COUNT=1 REDIS_PORT=13833 host=54.78.191.248 # WITH FILTERS +WORKERS=8 SHARD_COUNT=5 REDIS_PORT=18447 host=54.78.191.248 # WITH FILTERS +WORKERS=8 SHARD_COUNT=2 REDIS_PORT=19459 host=18.203.186.188 # WITH FILTERS +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 host=54.78.191.248 # WITH FILTERS WITH QPF + + +SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188 --engines 
redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000 + +# upload NOFILTERS +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=11926 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-search +# upload FILTERS +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search +#search NOFILTERS +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19387 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000 +# search FILTERS +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 K_RATIO=1.0 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --queries 5000 +# downloaddataset FILTERS +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --skip-search --queries 5000 diff --git a/engine/base_client/client.py b/engine/base_client/client.py index 761f148e..17bfdeb4 100644 --- a/engine/base_client/client.py +++ b/engine/base_client/client.py @@ -12,6 +12,11 @@ from engine.base_client.distances import Distance from engine.base_client.search import BaseSearcher from engine.base_client.upload import BaseUploader +from engine.clients.redis.config import ( + K_RATIO, + SHARD_COUNT, + WORKERS, +) RESULTS_DIR = ROOT_DIR / "results" RESULTS_DIR.mkdir(exist_ok=True) @@ -72,7 +77,10 @@ def analyze_precision_performance(search_results: Dict[str, Any]) -> tuple[Dict[ best_rps = rps best_config = { "parallel": experiment_data["params"]["parallel"], - "search_params": experiment_data["params"]["search_params"] + "search_params": experiment_data["params"]["search_params"], + "k_ratio": K_RATIO, + "shard_count": SHARD_COUNT, + "workers": WORKERS, } best_experiment_id = experiment_id best_p50_time = experiment_data["results"]["p50_time"] @@ -303,7 +311,9 @@ def run_experiment( print("Experiment stage: Done") # Add precision analysis if search results exist + top = 0 if results["search"]: + top = results["search"][list(results["search"].keys())[0]]["params"]["top"] precision_analysis, precision_summary = analyze_precision_performance(results["search"]) if precision_analysis: # Only add if we have precision data results["precision"] = precision_analysis @@ -314,8 +324,8 @@ def run_experiment( # Display results table and chart self._display_results_summary(precision_summary, dataset.config.name) - summary_file = f"{self.name}-{dataset.config.name}-summary.json" - summary_path = RESULTS_DIR / summary_file + summary_file = f"{self.name}-{dataset.config.name}-k_{top}-shards_{SHARD_COUNT}_k_ratio_{K_RATIO}-workers_{WORKERS}-summary.json" + summary_path = RESULTS_DIR / "final" / summary_file with open(summary_path, "w") as out: out.write( json.dumps( diff --git a/engine/clients/redis/config.py b/engine/clients/redis/config.py index 2bf68fc2..158ce2e8 100644 --- a/engine/clients/redis/config.py +++ b/engine/clients/redis/config.py @@ -4,6 +4,12 @@ REDIS_AUTH = os.getenv("REDIS_AUTH", None) REDIS_USER = os.getenv("REDIS_USER", None) REDIS_CLUSTER = bool(int(os.getenv("REDIS_CLUSTER", 0))) +SHARD_COUNT = int(os.getenv("SHARD_COUNT", 1)) +WORKERS = 
int(os.getenv("WORKERS", 8)) +K_RATIO = float(os.getenv("K_RATIO", 1.0)) + + + # One of BATCHES and ADHOC_BF # check https://redis.io/docs/latest/develop/interact/search-and-query/advanced-concepts/vectors/#pre-filter-query-attributes-hybrid-approach REDIS_HYBRID_POLICY = os.getenv("REDIS_HYBRID_POLICY", "") diff --git a/engine/clients/redis/configure.py b/engine/clients/redis/configure.py index 20544c3d..e892d85b 100644 --- a/engine/clients/redis/configure.py +++ b/engine/clients/redis/configure.py @@ -18,6 +18,8 @@ REDIS_USER, REDIS_CLUSTER, REDIS_KEEP_DOCUMENTS, + SHARD_COUNT, + K_RATIO ) @@ -98,6 +100,7 @@ def recreate(self, dataset: Dataset, collection_params): data_type = collection_params.get("data_type", "float32") algorithm_config = collection_params.get(f"{algo}_config", {}) print(f"Using algorithm {algo} with config {algorithm_config}") + print(f"shard count {SHARD_COUNT} with k_ratio {K_RATIO}") index_fields = [ VectorField( name="vector", diff --git a/engine/clients/redis/search.py b/engine/clients/redis/search.py index 31cf27bd..579f2090 100644 --- a/engine/clients/redis/search.py +++ b/engine/clients/redis/search.py @@ -12,6 +12,8 @@ REDIS_USER, REDIS_CLUSTER, REDIS_HYBRID_POLICY, + K_RATIO + ) from engine.clients.redis.parser import RedisConditionParser @@ -75,20 +77,24 @@ def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: else: prefilter_condition, params = conditions + shard_k_ratio = K_RATIO + shard_k_ratio_policy = f"{{$shard_k_ratio: {shard_k_ratio}}}" + prefilter_condition = "*" q = ( Query( - f"{prefilter_condition}=>[KNN $K @vector $vec_param {cls.knn_conditions} AS vector_score]{hybrid_policy}" + f"{prefilter_condition}=>[KNN $K @vector $vec_param {cls.knn_conditions} AS vector_score]{hybrid_policy}=>{shard_k_ratio_policy}" ) .sort_by("vector_score", asc=True) .paging(0, top) - .return_fields("vector_score") + # .return_fields("vector_score", "abstract") # performance is optimized for sorting operations on DIALECT 4 in different scenarios. 
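+            # Illustrative: with the shard_k_ratio policy appended above and
+            # K_RATIO=0.5, the final query string takes the shape (assuming no
+            # hybrid policy; {cls.knn_conditions} expands per configuration):
+            #   *=>[KNN $K @vector $vec_param ... AS vector_score]=>{$shard_k_ratio: 0.5}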
            # check SORTBY details in https://redis.io/commands/ft.search/
             .dialect(4)
-            .timeout(REDIS_QUERY_TIMEOUT)
+            .timeout(0)
         )
         params_dict = {
-            "vec_param": np.array(vector).astype(cls.np_data_type).tobytes(),
+            # "vec_param": np.array(vector).astype(cls.np_data_type).tobytes(),
+            "vec_param": vector,
             "K": top,
             **params,
         }

From c90bfe1100d7288917b43f1867a47534295cc41b Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Wed, 30 Jul 2025 13:33:34 +0000
Subject: [PATCH 03/14] more

---
 db_to_port_and_host.md | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/db_to_port_and_host.md b/db_to_port_and_host.md
index c1e4a72a..c47a0c73 100644
--- a/db_to_port_and_host.md
+++ b/db_to_port_and_host.md
@@ -1,12 +1,7 @@
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=14941 host=54.78.191.248
-WORKERS=8 SHARD_COUNT=5 REDIS_PORT=12041 host=18.203.186.188
-SHARD_COUNT=2 REDIS_PORT=12558 host=54.78.191.248
-SHARD_COUNT=1 REDIS_PORT=16556 host=18.203.186.188
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 host=18.203.186.188 # WITH FILTERS
-WORKERS=8 SHARD_COUNT=1 REDIS_PORT=13833 host=54.78.191.248 # WITH FILTERS
-WORKERS=8 SHARD_COUNT=5 REDIS_PORT=18447 host=54.78.191.248 # WITH FILTERS
-WORKERS=8 SHARD_COUNT=2 REDIS_PORT=19459 host=18.203.186.188 # WITH FILTERS
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 host=54.78.191.248 # WITH FILTERS WITH QPF
+HOST1: 18.200.246.132
+HOST2: 54.195.17.70
+
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 host=18.200.246.132 # WITH FILTERS WITH QPF
 
 
 SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000
@@ -14,8 +9,8 @@ SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188
 # upload NOFILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=11926 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-search
 # upload FILTERS
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search
-#search NOFILTERS
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 python3 run.py --host 18.200.246.132 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search
+# search NOFILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19387 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000
 # search FILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 K_RATIO=1.0 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --queries 5000
 # downloaddataset FILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --skip-search --queries 5000

From f77a432a92926a6e37f835dacc0923bca7c8620d Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Wed, 30 Jul 2025 13:33:47 +0000
Subject: [PATCH 04/14] use a single top=5000 search config

---
 experiments/configurations/redis-hnsw-single-node.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/experiments/configurations/redis-hnsw-single-node.json b/experiments/configurations/redis-hnsw-single-node.json
index 9ae5e233..28b964fd 100644
--- a/experiments/configurations/redis-hnsw-single-node.json
+++ b/experiments/configurations/redis-hnsw-single-node.json
@@ -22,7 +22,7 @@
       },
       "search_params": [
         { "parallel": 1, "search_params": { "ef": 64 } }, { "parallel": 1, "search_params": { "ef": 
128 } }, { "parallel": 1, "search_params": { "ef": 256 } }, { "parallel": 1, "search_params": { "ef": 512 } }, - { "parallel": 100, "search_params": { "ef": 64 } }, { "parallel": 100, "search_params": { "ef": 128 } }, { "parallel": 100, "search_params": { "ef": 256 } }, { "parallel": 100, "search_params": { "ef": 512 } } + { "parallel": 100, "top": 5000, "search_params": { "ef": 128 }} ], "upload_params": { "parallel": 16 } }, From 6330537c2bdde75a37617efc85be07b10d29f45e Mon Sep 17 00:00:00 2001 From: meiravgri Date: Wed, 30 Jul 2025 15:37:44 +0000 Subject: [PATCH 05/14] update --- dataset_reader/ann_compound_reader.py | 2 +- extract_table_data.py | 93 +++++++++++++++++++++++++ generate_baseline.py | 98 +++++++++++++++++++++++++++ 3 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 extract_table_data.py create mode 100644 generate_baseline.py diff --git a/dataset_reader/ann_compound_reader.py b/dataset_reader/ann_compound_reader.py index a2b03301..3fd96253 100644 --- a/dataset_reader/ann_compound_reader.py +++ b/dataset_reader/ann_compound_reader.py @@ -15,7 +15,7 @@ class AnnCompoundReader(JSONReader): """ VECTORS_FILE = "vectors.npy" - QUERIES_FILE = "tests.jsonl" + QUERIES_FILE = "tests_baseline_k5000_1_shards.jsonl" def read_vectors(self) -> Iterator[List[float]]: vectors = np.load(self.path / self.VECTORS_FILE) diff --git a/extract_table_data.py b/extract_table_data.py new file mode 100644 index 00000000..a847b554 --- /dev/null +++ b/extract_table_data.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +""" +Extract raw data table from benchmark results for Confluence. +""" + +import json +import os +import glob +import math + +RESULTS_DIR = "/home/ubuntu/vector-db-benchmark-1/results/final" + +def extract_data_from_file(file_path): + """Extract data from a single summary file.""" + try: + with open(file_path, 'r') as f: + data = json.load(f) + + # Get data from precision section + if 'precision' not in data or not data['precision']: + return None + + first_precision_key = list(data['precision'].keys())[0] + config = data['precision'][first_precision_key].get('config', {}) + + k_ratio_config = config.get('k_ratio') + shard_count = config.get('shard_count') + + # Get K from search section + if 'search' not in data or not data['search']: + return None + + first_search_key = list(data['search'].keys())[0] + search_params = data['search'][first_search_key].get('params', {}) + k_value = search_params.get('top') + + # Get QPS from precision_summary + if 'precision_summary' not in data: + return None + + precision_summary = data['precision_summary'] + best_precision = max(precision_summary.keys(), key=float) + qps = precision_summary[best_precision].get('qps', 0) + + # Calculate effective K + k_min = math.ceil(k_value / shard_count) + k_per_shard_req = math.ceil(k_value * k_ratio_config) + effective_k = max(k_min, k_per_shard_req) + actual_ratio = effective_k / k_value + + return { + 'shard_count': shard_count, + 'k': k_value, + 'effective_k': effective_k, + 'ratio': actual_ratio, + 'qps': qps, + 'filename': os.path.basename(file_path) + } + + except Exception as e: + print(f"Error processing {file_path}: {e}") + return None + +def main(): + """Extract all data and create table.""" + pattern = os.path.join(RESULTS_DIR, "*-summary.json") + files = glob.glob(pattern) + + print(f"Found {len(files)} files") + + all_data = [] + for file_path in files: + data = extract_data_from_file(file_path) + if data: + all_data.append(data) + + # Sort by shard count, then K, then effective K + 
all_data.sort(key=lambda x: (x['shard_count'], x['k'], x['effective_k']))
+
+    print("\n" + "="*70)
+    print("RAW DATA TABLE FOR CONFLUENCE:")
+    print("="*70)
+    print("| Shard Count | K | Effective K | Ratio | QPS |")
+    print("|-------------|---|-------------|-------|-----|")
+
+    for row in all_data:
+        print(f"| {row['shard_count']} | {row['k']} | {row['effective_k']} | {row['ratio']:.2f} | {row['qps']:.1f} |")
+
+    print("="*70)
+    print(f"\nTotal rows: {len(all_data)}")
+
+if __name__ == "__main__":
+    main()
diff --git a/generate_baseline.py b/generate_baseline.py
new file mode 100644
index 00000000..77bf90a0
--- /dev/null
+++ b/generate_baseline.py
@@ -0,0 +1,98 @@
+import redis
+import json
+import numpy as np
+
+MAX_INT = 10000000000000000000
+# Connect to Redis
+r = redis.Redis(host='54.78.191.248', port=13833, decode_responses=False, protocol=3)
+
+queries_folder = '/home/ubuntu/vector-db-benchmark/datasets/arxiv-titles-384-angular/arxiv'
+queries_file = f'{queries_folder}/tests.jsonl'
+
+# Load one query from tests.jsonl to test
+with open(queries_file) as f:
+    first_query = json.loads(f.readline())
+
+print(f"Original query has {len(first_query['closest_ids'])} expected results")
+
+k = 5000
+num_shards = 1
+
+# Perform search with k=5000 and shard_k_ratio=1.0
+query_vector = first_query['query']
+search_results = r.execute_command(
+    'FT.SEARCH', 'idx',
+    f'*=>[KNN {k} @vector $query_vec AS score]',
+    'PARAMS', '2', 'query_vec', np.array(query_vector, dtype=np.float32).tobytes(),
+    'SORTBY', 'score',
+    'RETURN', '1', 'score',
+    'LIMIT', '0', f'{k}',
+    'DIALECT', '4'
+)
+
+results = search_results[b'results']
+results_count = len(results)
+assert results_count == k, f"Expected {k} results, got {results_count}"
+print(f"example search result: {results[1]}")
+print(f"Search returned {results_count} results")
+
+# Extract document IDs from search results (RESP3 format)
+baseline_ids = []
+for result in results:
+    doc_id = result[b'id']
+    baseline_ids.append(int(doc_id))
+
+print(f"Extracted {len(baseline_ids)} baseline IDs")
+print(f"First 10 IDs: {baseline_ids[:10]}")
+
+def generate_baseline_for_queries(num_queries=MAX_INT):
+    baseline_queries = []
+
+    with open(queries_file) as f:
+        for line_num, line in enumerate(f):
+            query = json.loads(line)
+            if line_num >= num_queries:
+                break
+            if line_num % 100 == 0:
+                print(f"Processing query {line_num + 1}...")
+
+            # Perform search for this query
+            query_vector = query['query']
+            search_results = r.execute_command(
+                'FT.SEARCH', 'idx',
+                f'*=>[KNN {k} @vector $query_vec AS score]',
+                'PARAMS', '2', 'query_vec', np.array(query_vector, dtype=np.float32).tobytes(),
+                'SORTBY', 'score',
+                'RETURN', '1', 'score',
+                'LIMIT', '0', f'{k}',
+                'DIALECT', '4'
+            )
+
+            # Extract IDs from this search
+            results = search_results[b'results']
+            baseline_ids = []
+            baseline_scores = []
+            for result in results:
+                doc_id = result[b'id']
+                score = float(result[b'extra_attributes'][b'score'])
+                baseline_ids.append(int(doc_id))
+                baseline_scores.append(score)
+
+            # Create new query with baseline results
+            new_query = {
+                'query': query['query'],
+                'closest_ids': baseline_ids,
+                'closest_scores': baseline_scores,
+                'conditions': query['conditions']
+            }
+            baseline_queries.append(new_query)
+
+    # Save baseline queries to new file
+    output_file = f'{queries_folder}/tests_baseline_k{k}_{num_shards}_shards.jsonl'
+    with open(output_file, 'w') as f:
+        for query in baseline_queries:
+            f.write(json.dumps(query) + '\n')
+
+    print(f"Saved {len(baseline_queries)} baseline queries to {output_file}")
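+
+# Illustrative sketch (not executed here): once the baseline file is written,
+# recall of any engine's results can be measured against it, e.g.:
+#
+#   with open(output_file) as f:
+#       baseline = [json.loads(line) for line in f]
+#   expected = set(baseline[0]['closest_ids'])
+#   recall = len(expected & set(returned_ids)) / k
+#
+# where `returned_ids` is a hypothetical list of document IDs returned by the
+# engine under test for the first query.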
+# 
+generate_baseline_for_queries(num_queries=5000)

From 60a2ce9780f731fa6413a08e66a64de0e6a249ce Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Thu, 31 Jul 2025 05:16:13 +0000
Subject: [PATCH 06/14] update

---
 datasets/datasets.json                          |  8 +++++++-
 db_to_port_and_host.md => db_to_port_and_host   | 12 +++++++---
 engine/clients/redis/search.py                  |  2 +-
 .../configurations/redis-hnsw-single-node.json  |  2 +-
 4 files changed, 18 insertions(+), 6 deletions(-)
 rename db_to_port_and_host.md => db_to_port_and_host (55%)

diff --git a/datasets/datasets.json b/datasets/datasets.json
index 7b5fbabe..be9d94cd 100644
--- a/datasets/datasets.json
+++ b/datasets/datasets.json
@@ -1049,7 +1049,13 @@
         "update_date_ts": "int",
         "labels": "keyword",
         "submitter": "keyword",
-        "abstract": "text"
+        "abstract": "text",
+        "update_date": "int",
+        "categories": "keyword",
+        "authors": "keyword",
+        "comments": "text",
+        "title": "text",
+        "id": "keyword"
       },
       "vector_count": 2205995,
       "description": "Academic paper embeddings"
diff --git a/db_to_port_and_host.md b/db_to_port_and_host
similarity index 55%
rename from db_to_port_and_host.md
rename to db_to_port_and_host
index c47a0c73..1a1ba0ec 100644
--- a/db_to_port_and_host.md
+++ b/db_to_port_and_host
@@ -1,7 +1,10 @@
 HOST1: 18.200.246.132
 HOST2: 54.195.17.70
 
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 host=18.200.246.132 # WITH FILTERS WITH QPF
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 host=18.200.246.132 # WITH FILTERS WITH QPF NOT WORKING-DELETED
+
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16406 host=54.195.17.70 # WITH FILTERS WITH QPF ALL filters!
+WORKERS=8 SHARD_COUNT=5 REDIS_PORT=18861 host=18.200.246.132 # WITH FILTERS WITH QPF ALL filters!
 
 
 SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000
@@ -12,11 +15,11 @@ SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188
 # upload NOFILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=11926 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-search
 # upload FILTERS
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 python3 run.py --host 18.200.246.132 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16406 python3 run.py --host 54.195.17.70 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search
 # search NOFILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19387 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000
 # search FILTERS
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15194 K_RATIO=1.0 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --queries 5000
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 K_RATIO=1.0 python3 run.py --host 18.200.246.132 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 300 --skip-upload --queries 5000
 # downloaddataset FILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --skip-search --queries 5000
+
+
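+# copy the k=5000 baseline ground-truth file from HOST1 to the local dataset folder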
+scp -i pri_key ubuntu@18.200.246.132:/home/ubuntu/vector-db-benchmark-1/datasets/arxiv-titles-384-angular/arxiv/tests_baseline_k5000_1_shards.jsonl datasets/arxiv-titles-384-angular/arxiv/
diff --git a/engine/clients/redis/search.py b/engine/clients/redis/search.py
index 579f2090..4bb09c66 100644
--- a/engine/clients/redis/search.py
+++ b/engine/clients/redis/search.py
@@ -86,7 +86,7 @@ def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]:
         )
         .sort_by("vector_score", asc=True)
         .paging(0, top)
-        # .return_fields("vector_score", "abstract")
+        .return_fields("vector_score", "abstract", "vector")
         # performance is optimized for sorting operations on DIALECT 4 in different scenarios.
         # check SORTBY details in https://redis.io/commands/ft.search/
         .dialect(4)
diff --git a/experiments/configurations/redis-hnsw-single-node.json b/experiments/configurations/redis-hnsw-single-node.json
index 28b964fd..c98563b7 100644
--- a/experiments/configurations/redis-hnsw-single-node.json
+++ b/experiments/configurations/redis-hnsw-single-node.json
@@ -22,7 +22,7 @@
       },
       "search_params": [
         { "parallel": 1, "search_params": { "ef": 64 } }, { "parallel": 1, "search_params": { "ef": 128 } }, { "parallel": 1, "search_params": { "ef": 256 } }, { "parallel": 1, "search_params": { "ef": 512 } },
-        { "parallel": 100, "top": 5000, "search_params": { "ef": 128 }}
+        { "parallel": 300, "top": 5000, "search_params": { "ef": 128 }}
       ],
       "upload_params": { "parallel": 16 }
     },

From bfbdd4217408f8633310787650e64bbb00e2a282 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Thu, 31 Jul 2025 07:43:53 +0000
Subject: [PATCH 07/14] update dataset

---
 datasets/datasets.json | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/datasets/datasets.json b/datasets/datasets.json
index be9d94cd..0418f580 100644
--- a/datasets/datasets.json
+++ b/datasets/datasets.json
@@ -1050,12 +1050,10 @@ "update_date_ts": "int",
         "labels": "keyword",
         "submitter": "keyword",
         "abstract": "text",
-        "update_date": "int",
         "categories": "keyword",
         "authors": "keyword",
         "comments": "text",
-        "title": "text",
-        "id": "keyword"
+        "title": "text"
       },
       "vector_count": 2205995,
       "description": "Academic paper embeddings"

From db289a391df0a0fba961ad14f53f5f6e9efc97de Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Thu, 31 Jul 2025 07:53:53 +0000
Subject: [PATCH 08/14] update db

---
 db_to_port_and_host | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/db_to_port_and_host b/db_to_port_and_host
index 1a1ba0ec..c18950b4 100644
--- a/db_to_port_and_host
+++ b/db_to_port_and_host
@@ -2,9 +2,12 @@ HOST1: 18.200.246.132
 HOST2: 54.195.17.70
 
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 host=18.200.246.132 # WITH FILTERS WITH QPF NOT WORKING-DELETED
-
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16406 host=54.195.17.70 # WITH FILTERS WITH QPF ALL filters!
-WORKERS=8 SHARD_COUNT=5 REDIS_PORT=18861 host=18.200.246.132 # WITH FILTERS WITH QPF ALL filters!
+WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16406 host=54.195.17.70 # WITH FILTERS WITH QPF ALL filters!-DELETED
+WORKERS=8 SHARD_COUNT=5 REDIS_PORT=18861 host=18.200.246.132 # WITH FILTERS WITH QPF ALL filters!-DELETED
+
+ +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19283 host=18.200.246.132 # WITH FILTERS WITH QPF ALL filters +WORKERS=8 SHARD_COUNT=5 REDIS_PORT=13730 host=54.195.17.70 # WITH FILTERS WITH QPF ALL filters SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000 @@ -12,11 +15,11 @@ SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188 # upload NOFILTERS WORKERS=8 SHARD_COUNT=10 REDIS_PORT=11926 python3 run.py --host 54.78.191.248 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-search # upload FILTERS -WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16406 python3 run.py --host 54.195.17.70 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=13730 python3 run.py --host 54.195.17.70 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-search # search NOFILTERS WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19387 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000 # search FILTERS -WORKERS=8 SHARD_COUNT=10 REDIS_PORT=15045 K_RATIO=1.0 python3 run.py --host 18.200.246.132 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 300 --skip-upload --queries 5000 +WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19283 K_RATIO=1.0 python3 run.py --host 18.200.246.132 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 300 --skip-upload --queries 5000 # downloaddataset FILTERS WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --skip-search --queries 5000 From 3fac8b4284225aae74790d42f020e54a785edbc0 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Thu, 31 Jul 2025 10:27:07 +0000 Subject: [PATCH 09/14] update --- analyze_k_impact.py | 2 +- analyze_shard_k_ratio.py | 2 +- engine/clients/redis/search.py | 4 +- extract_benchmark_table.py | 180 +++++++++++++++++++++++++++++++++ generate_baseline.py | 43 ++++++-- 5 files changed, 217 insertions(+), 14 deletions(-) create mode 100644 extract_benchmark_table.py diff --git a/analyze_k_impact.py b/analyze_k_impact.py index 84997bd9..8e3bb67b 100644 --- a/analyze_k_impact.py +++ b/analyze_k_impact.py @@ -14,7 +14,7 @@ import math # Configuration -RESULTS_DIR = "/home/ubuntu/vector-db-benchmark-1/results/final" +RESULTS_DIR = "/home/ubuntu/vector-db-benchmark/results/final" OUTPUT_DIR = "./graphs_k_impact" def find_summary_files() -> List[str]: diff --git a/analyze_shard_k_ratio.py b/analyze_shard_k_ratio.py index 275f6639..8da5ef7e 100644 --- a/analyze_shard_k_ratio.py +++ b/analyze_shard_k_ratio.py @@ -15,7 +15,7 @@ from pathlib import Path import math # Configuration -RESULTS_DIR = "/home/ubuntu/vector-db-benchmark-1/results/final" +RESULTS_DIR = "/home/ubuntu/vector-db-benchmark/results/final" OUTPUT_DIR = "./graphs" def find_summary_files() -> List[str]: diff --git a/engine/clients/redis/search.py b/engine/clients/redis/search.py index 4bb09c66..976924c5 100644 --- a/engine/clients/redis/search.py +++ b/engine/clients/redis/search.py @@ -86,10 +86,10 @@ def search_one(cls, vector, meta_conditions, top) -> 
List[Tuple[int, float]]: ) .sort_by("vector_score", asc=True) .paging(0, top) - .return_fields("vector_score", "abstract", "vector") + .return_fields("vector_score", "abstract", "vector","comments", "labels", "authors", "update_date_ts") # performance is optimized for sorting operations on DIALECT 4 in different scenarios. # check SORTBY details in https://redis.io/commands/ft.search/ - .dialect(4) + .dialect(2) .timeout(0) ) params_dict = { diff --git a/extract_benchmark_table.py b/extract_benchmark_table.py new file mode 100644 index 00000000..a4a89fd5 --- /dev/null +++ b/extract_benchmark_table.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Extract benchmark results table from final summary files. +""" + +import os +import json +import glob +import re +from datetime import datetime + +def list_final_files(): + """List all final summary files and print basic info.""" + final_dir = "results/final" + + if not os.path.exists(final_dir): + print(f"Error: {final_dir} directory not found") + return [] + + # Get all JSON files in final directory + pattern = os.path.join(final_dir, "*.json") + files = glob.glob(pattern) + + print(f"Found {len(files)} final summary files:") + for i, file_path in enumerate(files, 1): + filename = os.path.basename(file_path) + print(f"{i:2d}. {filename}") + + return files + +def examine_file_structure(file_path): + """Examine the structure of a single final file.""" + print(f"\n=== Examining: {os.path.basename(file_path)} ===") + + try: + with open(file_path, 'r') as f: + data = json.load(f) + + print("Top-level keys:", list(data.keys())) + + # Check precision section + if 'precision' in data: + precision_keys = list(data['precision'].keys()) + print(f"Precision levels: {precision_keys}") + + if precision_keys: + first_precision = precision_keys[0] + config = data['precision'][first_precision].get('config', {}) + print(f"Config keys: {list(config.keys())}") + print(f"Sample config: {config}") + + # Check search section for referenced files + if 'search' in data: + search_keys = list(data['search'].keys()) + print(f"Number of search results: {len(search_keys)}") + if search_keys: + print(f"Sample search key: {search_keys[0]}") + + except Exception as e: + print(f"Error reading file: {e}") + +def parse_timestamp_from_filename(filename): + """Extract date and time from a filename with timestamp.""" + # Pattern: redis-hnsw-m-16-ef-128-arxiv-titles-384-angular-filters-search-4-403801-2025-07-31-09-44-01 + # We want the last part: 2025-07-31-09-44-01 + pattern = r'(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})$' + match = re.search(pattern, filename) + + if match: + timestamp_str = match.group(1) + try: + # Parse the timestamp + dt = datetime.strptime(timestamp_str, '%Y-%m-%d-%H-%M-%S') + return dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S') + except ValueError as e: + print(f"Error parsing timestamp {timestamp_str}: {e}") + return None, None + else: + print(f"No timestamp found in filename: {filename}") + return None, None + +def extract_config_from_final_file(file_path): + """Extract configuration parameters from a final summary file.""" + print(f"\n=== Extracting config from: {os.path.basename(file_path)} ===") + + try: + with open(file_path, 'r') as f: + data = json.load(f) + + # Get config from precision section + if 'precision' not in data or not data['precision']: + print("No precision data found") + return None + + # Get the first precision level + first_precision_key = list(data['precision'].keys())[0] + config = data['precision'][first_precision_key].get('config', 
{}) + + # Extract parameters + shard_count = config.get('shard_count') + workers = config.get('workers') + k_ratio = config.get('k_ratio') + + print(f"Config extracted: shard_count={shard_count}, workers={workers}, k_ratio={k_ratio}") + + # Get k value from search section + k_value = None + if 'search' in data and data['search']: + first_search_key = list(data['search'].keys())[0] + search_params = data['search'][first_search_key].get('params', {}) + k_value = search_params.get('top') + print(f"K value from search params: {k_value}") + + # Get timestamps from search keys + timestamps = [] + if 'search' in data: + for search_key in data['search'].keys(): + date, time = parse_timestamp_from_filename(search_key) + if date and time: + timestamps.append((date, time)) + print(f"Found timestamp: {date} {time}") + + return { + 'shard_count': shard_count, + 'workers': workers, + 'k': k_value, + 'ratio': k_ratio, + 'timestamps': timestamps, + 'filename': os.path.basename(file_path) + } + + except Exception as e: + print(f"Error processing file: {e}") + return None + +def generate_table(): + """Generate the complete table from all final files.""" + print("\n" + "=" * 80) + print("Step 2c: Generating complete table") + print("=" * 80) + + files = list_final_files() + all_rows = [] + + for file_path in files: + print(f"\nProcessing: {os.path.basename(file_path)}") + config_data = extract_config_from_final_file(file_path) + + if config_data and config_data['timestamps']: + # Create a row for each timestamp in this file + for date, time in config_data['timestamps']: + row = { + 'date': date, + 'time': time, + 'shard_count': config_data['shard_count'], + 'workers': config_data['workers'], + 'k': config_data['k'], + 'ratio': config_data['ratio'] + } + all_rows.append(row) + print(f" Added row: {date} {time} | shards={row['shard_count']} | workers={row['workers']} | k={row['k']} | ratio={row['ratio']}") + + # Sort by date and time + all_rows.sort(key=lambda x: (x['date'], x['time'])) + + print(f"\n" + "=" * 80) + print("FINAL TABLE") + print("=" * 80) + print(f"{'Date':<12} {'Time':<10} {'Shard Count':<12} {'Workers':<8} {'K':<6} {'Ratio':<6}") + print("-" * 80) + + for row in all_rows: + print(f"{row['date']:<12} {row['time']:<10} {row['shard_count']:<12} {row['workers']:<8} {row['k']:<6} {row['ratio']:<6}") + + print(f"\nTotal rows: {len(all_rows)}") + return all_rows + +if __name__ == "__main__": + # Generate the complete table + table_data = generate_table() diff --git a/generate_baseline.py b/generate_baseline.py index 77bf90a0..4154d022 100644 --- a/generate_baseline.py +++ b/generate_baseline.py @@ -1,10 +1,12 @@ import redis import json import numpy as np +from redis.commands.search.query import Query + MAX_INT = 10000000000000000000 # Connect to Redis -r = redis.Redis(host='54.78.191.248', port=13833, decode_responses=False, protocol=3) +r = redis.Redis(host='18.200.246.132', port=19283, decode_responses=False, protocol=3) queries_folder = '/home/ubuntu/vector-db-benchmark/datasets/arxiv-titles-384-angular/arxiv' queries_file = f'{queries_folder}/tests.jsonl' @@ -20,15 +22,36 @@ # Perform search with k=1000 and shard_k_ratio=1.0 query_vector = first_query['query'] -search_results = r.execute_command( - 'FT.SEARCH', 'idx', - f'*=>[KNN {k} @vector $query_vec AS score]', - 'PARAMS', '2', 'query_vec', np.array(query_vector, dtype=np.float32).tobytes(), - 'SORTBY', 'score', - 'RETURN', '1', 'score', - 'LIMIT', '0', f'{k}', - 'DIALECT', '4' +shard_k_ratio = 1.0 +shard_k_ratio_policy = f"{{$shard_k_ratio: 
{shard_k_ratio}}"
+prefilter_condition = "*"
+q = (
+    Query(
+        f"*=>[KNN $K @vector $vec_param AS vector_score]=>{shard_k_ratio_policy}"
+    )
+    .sort_by("vector_score", asc=True)
+    .paging(0, k)
+    # .return_fields("vector_score", "abstract", "vector")
+    # performance is optimized for sorting operations on DIALECT 4 in different scenarios.
+    # check SORTBY details in https://redis.io/commands/ft.search/
+    .dialect(4)
+    .timeout(0)
+)
+search_results = r.ft().search(
+    q, query_params={
+        "vec_param": np.array(query_vector, dtype=np.float32).tobytes(),
+        "K": k,
+    }
 )
+# search_results = r.execute_command(
+#     'FT.SEARCH', 'idx',
+#     f'*=>[KNN {k} @vector $query_vec AS score]',
+#     'PARAMS', '2', 'query_vec', np.array(query_vector, dtype=np.float32).tobytes(),
+#     'SORTBY', 'score',
+#     'RETURN', '1', 'score',
+#     'LIMIT', '0', f'{k}',
+#     'DIALECT', '4'
+# )
 
 results = search_results[b'results']
 results_count = len(results)
@@ -95,4 +118,4 @@ def generate_baseline_for_queries(num_queries=MAX_INT):
     print(f"Saved {len(baseline_queries)} baseline queries to {output_file}")
 
 #
-generate_baseline_for_queries(num_queries=5000)
+# generate_baseline_for_queries(num_queries=5000)

From 1c37c6bfc72231016040607a367dba8a7c1cd1bf Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Thu, 31 Jul 2025 11:53:49 +0000
Subject: [PATCH 10/14] db port

---
 db_to_port_and_host | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/db_to_port_and_host b/db_to_port_and_host
index c18950b4..b8d168c6 100644
--- a/db_to_port_and_host
+++ b/db_to_port_and_host
@@ -8,6 +8,7 @@
 WORKERS=8 SHARD_COUNT=5 REDIS_PORT=18861 host=18.200.246.13218861 # WITH FILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19283 host=18.200.246.132 # WITH FILTERS WITH QPF ALL filters
 WORKERS=8 SHARD_COUNT=5 REDIS_PORT=13730 host=54.195.17.70 # WITH FILTERS WITH QPF ALL filters
+WORKERS=8 SHARD_COUNT=2 REDIS_PORT=15463 host=54.195.17.70 # WITH FILTERS WITH QPF ALL filters
 
 SHARD_COUNT=1 K_RATIO=1.0 REDIS_PORT=16556 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000
@@ -19,7 +20,7 @@ WORKERS=8 SHARD_COUNT=10 REDIS_PORT=13730 python3 run.py --host 54.195.17.70 --e
 # search NOFILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19387 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-no-filters --parallels 100 --skip-upload --queries 5000
 
 # search FILTERS
-WORKERS=8 SHARD_COUNT=10 REDIS_PORT=19283 K_RATIO=1.0 python3 run.py --host 18.200.246.132 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 300 --skip-upload --queries 5000
+WORKERS=8 SHARD_COUNT=5 REDIS_PORT=13730 K_RATIO=1.0 python3 run.py --host 54.195.17.70 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 300 --skip-upload --queries 5000
 
 # download dataset FILTERS
 WORKERS=8 SHARD_COUNT=10 REDIS_PORT=16318 K_RATIO=1.0 python3 run.py --host 18.203.186.188 --engines redis-hnsw-m-16-ef-128 --datasets arxiv-titles-384-angular-filters --parallels 100 --skip-upload --skip-search --queries 5000

From c8755b2840dd81acbba3f44dad9533012eaeeb3e Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Sun, 3 Aug 2025 04:25:27 +0000
Subject: [PATCH 11/14] add latency graph

---
 analyze_shard_k_ratio.py | 205 +++++++++++++++++++++++++++++++++++----
 1 file changed, 188 insertions(+), 17 deletions(-)
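Note: the effective K per shard that these QPS/latency graphs put on the x-axis follows the max(ceil(k / shard_count), ceil(k * k_ratio)) rule shared by the analysis scripts (see calculate_effective_k). A minimal standalone sketch of that rule, with illustrative values rather than benchmark output:

import math

def effective_k(k: int, shard_count: int, k_ratio: float) -> int:
    # Per-shard request size: never below the even split ceil(k / shard_count),
    # scaled up by k_ratio when the ratio asks for more.
    k_min = math.ceil(k / shard_count)
    k_per_shard_req = math.ceil(k * k_ratio)
    return max(k_min, k_per_shard_req)

# k=1000 on 10 shards at ratio 0.3: ceil(1000 * 0.3) = 300 > ceil(1000 / 10) = 100
assert effective_k(1000, 10, 0.3) == 300
# At ratio 0.05 the floor wins: ceil(1000 * 0.05) = 50 < 100
assert effective_k(1000, 10, 0.05) == 100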
diff --git a/analyze_shard_k_ratio.py b/analyze_shard_k_ratio.py
index 8da5ef7e..2b83db7b 100644
--- a/analyze_shard_k_ratio.py
+++ b/analyze_shard_k_ratio.py
@@ -43,9 +43,6 @@ def get_wrokers_from_filename(filename: str) -> int:
         # Old format: ...k_ratio_1.0-summary.json
         workers = "8"  # or "unknown" - your choice for default
 
-    # Remove -summary.json suffix
-    name_parts = basename.replace("-summary.json", "").split("-")
-
     return int(workers)
 
 def load_benchmark_data(file_path: str) -> Dict:
@@ -142,6 +139,21 @@ def organize_data_by_k_and_shards(summaries: List[Dict]) -> Dict[tuple, List[Dic
 
     return by_k_and_shards
 
+def organize_data_by_shards_and_workers(summaries: List[Dict]) -> Dict[tuple, List[Dict]]:
+    """Organize summary data by (shard_count, workers) combination for multi-k graphs."""
+    by_shards_and_workers = {}
+
+    for summary in summaries:
+        shard_count = summary['shard_count']
+        workers = summary['workers']
+        key = (shard_count, workers)
+
+        if key not in by_shards_and_workers:
+            by_shards_and_workers[key] = []
+        by_shards_and_workers[key].append(summary)
+
+    return by_shards_and_workers
+
 def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output_dir: str):
     """Create a graph for a specific shard count showing ratio vs performance/recall."""
     if not summaries:
@@ -197,11 +209,21 @@ def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output
         ax1.set_xlim(min(x_axis_effective_k) * 0.95, max(x_axis_effective_k) * 1.05)
 
     # Create second y-axis for precision (recall)
+    # we are limited to k=5000 baseline results
     ax2 = ax1.twinx()
-    color2 = 'tab:red'
-    ax2.set_ylabel('Precision (Recall)', color=color2)
-    line2 = ax2.plot(effective_k_values, precision_values, 's-', color=color2, label='Precision', linewidth=2, markersize=6)
-    ax2.tick_params(axis='y', labelcolor=color2)
+    if k_value <= 5000:
+        color2 = 'tab:red'
+        ax2.set_ylabel('Precision (Recall)', color=color2)
+        line2 = ax2.plot(effective_k_values, precision_values, 's-', color=color2, label='Precision', linewidth=2, markersize=6)
+        ax2.tick_params(axis='y', labelcolor=color2)
+
+        # Add labels for Precision values (red line) - positioned closer below points
+        for i, (x, y) in enumerate(zip(effective_k_values, precision_values)):
+            # Alternate positioning: odd indices go slightly lower to avoid overlap
+            offset_y = -12 if i % 2 == 0 else -16
+            ax2.annotate(f'{y:.2f}', (x, y), textcoords="offset points", xytext=(0, offset_y), ha='center',
+                        fontsize=8, color=color2, weight='bold',
+                        bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color2))
 
     # Format precision y-axis to show max 3 decimal places
     ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.3f}'))
@@ -214,14 +236,6 @@ def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output
                     fontsize=8, color=color1, weight='bold',
                     bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color1))
 
-    # Add labels for Precision values (red line) - positioned closer below points
-    for i, (x, y) in enumerate(zip(effective_k_values, precision_values)):
-        # Alternate positioning: odd indices go slightly lower to avoid overlap
-        offset_y = -12 if i % 2 == 0 else -16
-        ax2.annotate(f'{y:.2f}', (x, y), textcoords="offset points", xytext=(0, offset_y), ha='center',
-                    fontsize=8, color=color2, weight='bold',
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color2))
-
     # Add selective improvement annotations (Option C)
     if len(summaries) >= 2:
         # Find the point with ratio 1.0 (baseline) and lowest ratio
@@ -246,7 +260,10 @@ def 
create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output # Option A: Place annotation outside graph area, bottom-left corner lowest_ratio = ratios[lowest_ratio_idx] - annotation_text = f"Ratio: {lowest_ratio:.1f} vs 1.0\n{qps_improvement:+.0f}% QPS\n{precision_change:+.1f}% Precision" + if k_value <= 5000: + annotation_text = f"Ratio: {lowest_ratio:.1f} vs 1.0\n{qps_improvement:+.0f}% QPS\n{precision_change:+.1f}% Precision" + else: + annotation_text = f"Ratio: {lowest_ratio:.1f} vs 1.0\n{qps_improvement:+.0f}% QPS" # Position outside the plot area in bottom-left, below x-axis labels and legend ax1.text(0.02, -0.13, annotation_text, transform=ax1.transAxes, @@ -257,7 +274,10 @@ def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output plt.title(f'Performance vs EffectiveK - {shard_count} Shards (K={k_value})', fontsize=14, fontweight='bold') # Create proper legend below x-axis, centered in one line - lines = line1 + line2 + if k_value <= 5000: + lines = line1 + line2 + else: + lines = line1 labels = [l.get_label() for l in lines] ax1.legend(lines, labels, bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=2, frameon=False, fontsize=10) @@ -282,6 +302,147 @@ def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output print(f" QPS range: {min(qps_values):.1f} - {max(qps_values):.1f}") print(f" Precision range: {min(precision_values):.3f} - {max(precision_values):.3f}") +def create_summary_graph_for_shards_workers(shard_count: int, workers: int, summaries: List[Dict], output_dir: str): + """Create a summary graph showing QPS vs effective k for multiple k values.""" + if not summaries: + print(f"No data for shard count {shard_count}, workers {workers}") + return + + # Group summaries by k_value + by_k_value = {} + for summary in summaries: + k_value = summary['k_value'] + if k_value not in by_k_value: + by_k_value[k_value] = [] + by_k_value[k_value].append(summary) + + # Create the plot + fig, ax = plt.subplots(figsize=(12, 8)) + + colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray'] + + for i, (k_value, k_summaries) in enumerate(sorted(by_k_value.items())): + # Sort by k_ratio for proper line plotting + k_summaries = sorted(k_summaries, key=lambda x: x['k_ratio']) + + # Extract data for plotting + effective_k_values = [calculate_effective_k(s['k_ratio'], shard_count, s['k_value']) for s in k_summaries] + ratios = [s['k_ratio'] for s in k_summaries] + qps_values = [s['qps'] for s in k_summaries] + + # Plot line for this k value + color = colors[i % len(colors)] + ax.plot(effective_k_values, qps_values, 'o-', color=color, label=f'K={k_value}', + linewidth=2, markersize=6) + + # Set labels and title + ax.set_xlabel('Effective K', fontsize=12) + ax.set_ylabel('QPS (Queries Per Second)', fontsize=12) + ax.grid(True, alpha=0.3) + + # Set x-axis ticks at intervals of 1000 + all_effective_k = [] + for k_summaries in by_k_value.values(): + for s in k_summaries: + all_effective_k.append(calculate_effective_k(s['k_ratio'], shard_count, s['k_value'])) + + if all_effective_k: + min_k = min(all_effective_k) + max_k = max(all_effective_k) + # Create ticks at 1000 intervals + tick_start = (min_k // 1000) * 1000 + tick_end = ((max_k // 1000) + 1) * 1000 + x_ticks = list(range(int(tick_start), int(tick_end) + 1, 1000)) + ax.set_xticks(x_ticks) + + plt.title(f'QPS vs EffectiveK Summary - {shard_count} Shards, {workers} Workers', + fontsize=14, fontweight='bold') + + # Add legend + 
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', frameon=True, fontsize=10) + + # Improve layout + plt.tight_layout() + + # Save the graph + filename = f"summary_shard_count_{shard_count}_workers_{workers}_qps_vs_effective_k.png" + filepath = os.path.join(output_dir, filename) + plt.savefig(filepath, dpi=300, bbox_inches='tight') + plt.close() + + print(f" Summary graph saved: {filename}") + print(f" K values included: {sorted(by_k_value.keys())}") + +def create_p95_summary_graph_for_shards_workers(shard_count: int, workers: int, summaries: List[Dict], output_dir: str): + """Create a summary graph showing P95 latency vs effective k for multiple k values.""" + if not summaries: + print(f"No data for shard count {shard_count}, workers {workers}") + return + + # Group summaries by k_value + by_k_value = {} + for summary in summaries: + k_value = summary['k_value'] + if k_value not in by_k_value: + by_k_value[k_value] = [] + by_k_value[k_value].append(summary) + + # Create the plot + fig, ax = plt.subplots(figsize=(12, 8)) + + colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray'] + + for i, (k_value, k_summaries) in enumerate(sorted(by_k_value.items())): + # Sort by k_ratio for proper line plotting + k_summaries = sorted(k_summaries, key=lambda x: x['k_ratio']) + + # Extract data for plotting + effective_k_values = [calculate_effective_k(s['k_ratio'], shard_count, s['k_value']) for s in k_summaries] + p95_values = [s['p95'] for s in k_summaries] + + # Plot line for this k value + color = colors[i % len(colors)] + ax.plot(effective_k_values, p95_values, 'o-', color=color, label=f'K={k_value}', + linewidth=2, markersize=6) + + # Set labels and title + ax.set_xlabel('Effective K', fontsize=12) + ax.set_ylabel('P95 Latency (ms)', fontsize=12) + ax.grid(True, alpha=0.3) + + # Set x-axis ticks at intervals of 1000 + all_effective_k = [] + for k_summaries in by_k_value.values(): + for s in k_summaries: + all_effective_k.append(calculate_effective_k(s['k_ratio'], shard_count, s['k_value'])) + + if all_effective_k: + min_k = min(all_effective_k) + max_k = max(all_effective_k) + # Create ticks at 1000 intervals + tick_start = (min_k // 1000) * 1000 + tick_end = ((max_k // 1000) + 1) * 1000 + x_ticks = list(range(int(tick_start), int(tick_end) + 1, 1000)) + ax.set_xticks(x_ticks) + + plt.title(f'P95 Latency vs EffectiveK Summary - {shard_count} Shards, {workers} Workers', + fontsize=14, fontweight='bold') + + # Add legend + ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', frameon=True, fontsize=10) + + # Improve layout + plt.tight_layout() + + # Save the graph + filename = f"summary_shard_count_{shard_count}_workers_{workers}_p95_vs_effective_k.png" + filepath = os.path.join(output_dir, filename) + plt.savefig(filepath, dpi=300, bbox_inches='tight') + plt.close() + + print(f" P95 summary graph saved: {filename}") + print(f" K values included: {sorted(by_k_value.keys())}") + def main(): """Main function to orchestrate the analysis.""" print("Starting shard_k_ratio analysis...") @@ -335,6 +496,16 @@ def main(): print(f"\nCreating graph for K={k_value}, {shard_count} shards, {workers} workers:") create_graph_for_shard_count(shard_count, summaries, OUTPUT_DIR) + # Create summary graphs for each (shard_count, workers) combination + by_shards_and_workers = organize_data_by_shards_and_workers(all_summaries) + print(f"\nCreating summary graphs for shard/worker combinations: {sorted(by_shards_and_workers.keys())}") + + for (shard_count, workers) 
in sorted(by_shards_and_workers.keys()):
+        summaries = by_shards_and_workers[(shard_count, workers)]
+        print(f"\nCreating summary graph for {shard_count} shards, {workers} workers:")
+        create_summary_graph_for_shards_workers(shard_count, workers, summaries, OUTPUT_DIR)
+        create_p95_summary_graph_for_shards_workers(shard_count, workers, summaries, OUTPUT_DIR)
+
     print(f"\nAll graphs saved to: {OUTPUT_DIR}")
     print("Analysis complete!")
 

From 1007c51e71430194229455aa7421d5f5578e4211 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Sun, 3 Aug 2025 10:21:28 +0000
Subject: [PATCH 12/14] update

---
 analyze_shard_k_ratio.py | 2 +-
 extract_table_data.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/analyze_shard_k_ratio.py b/analyze_shard_k_ratio.py
index 2b83db7b..c75b8a40 100644
--- a/analyze_shard_k_ratio.py
+++ b/analyze_shard_k_ratio.py
@@ -192,7 +192,7 @@ def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output
         min_ratio = min(ratios)
         # Create x-axis values from min_ratio to 1.0 in 0.1 increments
         x_axis_ratios = []
-        current_ratio = min_ratio
+        current_ratio = 0.0
         while current_ratio <= 1.0:
             x_axis_ratios.append(round(current_ratio, 1))
             current_ratio += 0.1
diff --git a/extract_table_data.py b/extract_table_data.py
index a847b554..f0c67df5 100644
--- a/extract_table_data.py
+++ b/extract_table_data.py
@@ -8,7 +8,7 @@ import glob
 import math
 
-RESULTS_DIR = "/home/ubuntu/vector-db-benchmark-1/results/final"
+RESULTS_DIR = "/home/ubuntu/vector-db-benchmark/results/final"
 
 def extract_data_from_file(file_path):
     """Extract data from a single summary file."""

From 1bc7c65c772d7c4d28c03ed4f4799a458051e7f0 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Sun, 3 Aug 2025 11:51:52 +0000
Subject: [PATCH 13/14] add multi-line (shard) graph to analyze_shard_k_ratio

---
 analyze_shard_k_ratio.py | 81 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 1 deletion(-)
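Note: the ratio axes these graphs depend on are built by repeatedly adding 0.1 to a float (the loop adjusted in the previous patch), and repeated float addition can accumulate rounding error and skip the final 1.0 tick. A drift-free sketch using integer steps, offered as an illustrative alternative rather than part of this patch:

def ratio_ticks(start: float = 0.0, stop: float = 1.0, step: float = 0.1) -> list:
    # Derive each tick from an integer index so rounding error never
    # compounds and the endpoint is always included.
    n = round((stop - start) / step)
    return [round(start + i * step, 1) for i in range(n + 1)]

assert ratio_ticks() == [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]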
diff --git a/analyze_shard_k_ratio.py b/analyze_shard_k_ratio.py
index c75b8a40..d1abbd28 100644
--- a/analyze_shard_k_ratio.py
+++ b/analyze_shard_k_ratio.py
@@ -15,7 +15,7 @@ from pathlib import Path
 import math
 
 # Configuration
-RESULTS_DIR = "/home/ubuntu/vector-db-benchmark/results/final"
+RESULTS_DIR = "results/final"
 OUTPUT_DIR = "./graphs"
 
 def find_summary_files() -> List[str]:
@@ -154,6 +154,21 @@ def organize_data_by_shards_and_workers(summaries: List[Dict]) -> Dict[tuple, Li
 
     return by_shards_and_workers
 
+def organize_data_by_k_and_workers(summaries: List[Dict]) -> Dict[tuple, List[Dict]]:
+    """Organize summary data by (k_value, workers) combination for multi-shard graphs."""
+    by_k_and_workers = {}
+
+    for summary in summaries:
+        k_value = summary['k_value']
+        workers = summary['workers']
+        key = (k_value, workers)
+
+        if key not in by_k_and_workers:
+            by_k_and_workers[key] = []
+        by_k_and_workers[key].append(summary)
+
+    return by_k_and_workers
+
 def create_graph_for_shard_count(shard_count: int, summaries: List[Dict], output_dir: str):
     """Create a graph for a specific shard count showing ratio vs performance/recall."""
     if not summaries:
@@ -443,6 +458,61 @@ def create_p95_summary_graph_for_shards_workers(shard_count: int, workers: int,
     print(f" P95 summary graph saved: {filename}")
     print(f" K values included: {sorted(by_k_value.keys())}")
 
+def create_qps_vs_ratio_graph_for_k_workers(k_value: int, workers: int, summaries: List[Dict], output_dir: str):
+    """Create a QPS vs Ratio graph for a specific K value showing different shard counts as lines."""
+    if not summaries:
+        print(f"No data for K={k_value}, workers {workers}")
+        return
+
+    # Group summaries by shard_count
+    by_shard_count = {}
+    for summary in summaries:
+        shard_count = summary['shard_count']
+        if shard_count not in by_shard_count:
+            by_shard_count[shard_count] = []
+        by_shard_count[shard_count].append(summary)
+
+    # Create the plot
+    fig, ax = plt.subplots(figsize=(12, 8))
+
+    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray']
+
+    for i, (shard_count, shard_summaries) in enumerate(sorted(by_shard_count.items())):
+        # Sort by k_ratio for proper line plotting
+        shard_summaries = sorted(shard_summaries, key=lambda x: x['k_ratio'])
+
+        # Extract data for plotting
+        ratios = [s['k_ratio'] for s in shard_summaries]
+        qps_values = [s['qps'] for s in shard_summaries]
+
+        # Plot line for this shard count
+        color = colors[i % len(colors)]
+        ax.plot(ratios, qps_values, 'o-', color=color, label=f'{shard_count} Shards',
+                linewidth=2, markersize=6)
+
+    # Set labels and title
+    ax.set_xlabel('Shard K Ratio', fontsize=12)
+    ax.set_ylabel('QPS (Queries Per Second)', fontsize=12)
+    ax.grid(True, alpha=0.3)
+
+    plt.title(f'QPS vs Shard K Ratio - K={k_value}, {workers} Workers',
+              fontsize=14, fontweight='bold')
+
+    # Add legend
+    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', frameon=True, fontsize=10)
+
+    # Improve layout
+    plt.tight_layout()
+
+    # Save the graph
+    filename = f"qps_vs_ratio_k_{k_value}_workers_{workers}.png"
+    filepath = os.path.join(output_dir, filename)
+    plt.savefig(filepath, dpi=300, bbox_inches='tight')
+    plt.close()
+
+    print(f" QPS vs Ratio graph saved: {filename}")
+    print(f" Shard counts included: {sorted(by_shard_count.keys())}")
+
 def main():
     """Main function to orchestrate the analysis."""
     print("Starting shard_k_ratio analysis...")
@@ -506,6 +576,15 @@ def main():
         create_summary_graph_for_shards_workers(shard_count, workers, summaries, OUTPUT_DIR)
         create_p95_summary_graph_for_shards_workers(shard_count, workers, summaries, OUTPUT_DIR)
 
+    # Create QPS vs Ratio graphs for each (k_value, workers) combination
+    by_k_and_workers = organize_data_by_k_and_workers(all_summaries)
+    print(f"\nCreating QPS vs Ratio graphs for K/worker combinations: {sorted(by_k_and_workers.keys())}")
+
+    for (k_value, workers) in sorted(by_k_and_workers.keys()):
+        summaries = by_k_and_workers[(k_value, workers)]
+        print(f"\nCreating QPS vs Ratio graph for K={k_value}, {workers} workers:")
+        create_qps_vs_ratio_graph_for_k_workers(k_value, workers, summaries, OUTPUT_DIR)
+
     print(f"\nAll graphs saved to: {OUTPUT_DIR}")
     print("Analysis complete!")
 

From 80c96670803c2fb99ec6c8e3b7180e6cf2d01e33 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Sun, 3 Aug 2025 14:33:58 +0000
Subject: [PATCH 14/14] update

---
 analyze_k_impact.py           |   4 +-
 analyze_k_summary.py          | 191 +++++++++++++++++++++++++++
 analyze_shard_k_ratio.py      |  16 ++-
 extract_table_data.py         |   2 +-
 generate_performance_table.py | 235 ++++++++++++++++++++++++++++++++++
 5 files changed, 443 insertions(+), 5 deletions(-)
 create mode 100644 analyze_k_summary.py
 create mode 100644 generate_performance_table.py

diff --git a/analyze_k_impact.py b/analyze_k_impact.py
index 8e3bb67b..a1f5ee55 100644
--- a/analyze_k_impact.py
+++ b/analyze_k_impact.py
@@ -14,7 +14,7 @@ import math
 
 # Configuration
-RESULTS_DIR = "/home/ubuntu/vector-db-benchmark/results/final"
+RESULTS_DIR = "results/final"
 OUTPUT_DIR = "./graphs_k_impact"
 
 def find_summary_files() -> List[str]:
@@ -200,7 +200,7 @@ def 
create_combined_k_impact_graph(shard_count: int, workers: int, configs_data: """Create a combined graph with multiple lines, each representing an effective K value.""" # Filter configs that have multiple K values - valid_configs = {eff_k: summaries for eff_k, summaries in configs_data.items() if len(summaries) >= 2} + valid_configs = {eff_k: summaries for eff_k, summaries in configs_data.items() if len(summaries) >= 1} if not valid_configs: print(f"Skipping combined graph for {shard_count} shards, {workers} workers - no configs with multiple K values") diff --git a/analyze_k_summary.py b/analyze_k_summary.py new file mode 100644 index 00000000..a0741502 --- /dev/null +++ b/analyze_k_summary.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +""" +Script to generate K value analysis summary from benchmark results. + +This script analyzes the results and generates a summary showing QPS improvements +for different shard configurations and K ratios compared to baseline (ratio 1.0). +""" + +import json +import os +import glob +import math +from typing import Dict, List, Optional +from collections import defaultdict + +# Configuration +RESULTS_DIR = "results/final" + +def find_summary_files() -> List[str]: + """Find all summary JSON files in the results directory.""" + pattern = os.path.join(RESULTS_DIR, "*-summary.json") + files = glob.glob(pattern) + return files + +def get_workers_from_filename(filename: str) -> int: + """Parse filename to extract workers.""" + basename = os.path.basename(filename) + + if "workers_" in basename: + workers_part = basename.split("workers_")[1] + workers = workers_part.split("-")[0] + else: + workers = "8" # default + + return int(workers) + +def load_benchmark_data(file_path: str) -> Dict: + """Load and parse a benchmark summary JSON file.""" + try: + with open(file_path, 'r') as f: + data = json.load(f) + return data + except Exception as e: + print(f"Error loading {file_path}: {e}") + return {} + +def calculate_effective_k(k_ratio, shard_count, k): + """Calculate effective K value based on ratio and shard configuration.""" + num_shards = shard_count + k_ratio_config = k_ratio + + k_min = math.ceil(k / num_shards) + k_per_shard_req = math.ceil(k * k_ratio_config) + effective_k = max(k_min, k_per_shard_req) + + return effective_k + +def calculate_actual_k_ratio(k_ratio, shard_count, k): + """Calculate actual K ratio based on effective K.""" + effective_k = calculate_effective_k(k_ratio, shard_count, k) + k_ratio_actual = effective_k / k + return k_ratio_actual + +def extract_summary_data(data: Dict, filename: str) -> Optional[Dict]: + """Extract summary data from a benchmark JSON file.""" + if 'precision_summary' not in data or 'precision' not in data: + return None + + precision_data = data['precision'] + if not precision_data: + return None + + # Get configuration from first precision entry + first_precision_key = list(precision_data.keys())[0] + config = precision_data[first_precision_key].get('config', {}) + + k_ratio = config.get('k_ratio') + shard_count = config.get('shard_count') + workers = get_workers_from_filename(filename) + + if k_ratio is None or shard_count is None: + return None + + # Get K value from search section + k_value = None + if 'search' in data and data['search']: + first_search_key = list(data['search'].keys())[0] + search_params = data['search'][first_search_key].get('params', {}) + k_value = search_params.get('top') + + if k_value is None: + return None + + # Extract performance data from precision_summary + precision_summary = 
data['precision_summary'] + best_precision = max(precision_summary.keys(), key=float) + best_data = precision_summary[best_precision] + + return { + 'filename': filename, + 'k_ratio': k_ratio, + 'actual_k_ratio': calculate_actual_k_ratio(k_ratio, shard_count, k_value), + 'shard_count': shard_count, + 'workers': workers, + 'k_value': k_value, + 'precision': float(best_precision), + 'qps': best_data.get('qps', 0), + 'p50': best_data.get('p50', 0), + 'p95': best_data.get('p95', 0), + } + +def organize_data_by_k_value(summaries: List[Dict]) -> Dict[int, Dict[int, List[Dict]]]: + """Organize data by K value, then by shard count.""" + by_k_value = defaultdict(lambda: defaultdict(list)) + + for summary in summaries: + k_value = summary['k_value'] + shard_count = summary['shard_count'] + by_k_value[k_value][shard_count].append(summary) + + return dict(by_k_value) + +def generate_k_analysis_summary(data_by_k: Dict[int, Dict[int, List[Dict]]]): + """Generate and print the K analysis summary.""" + + for k_value in sorted(data_by_k.keys()): + print(f"K={k_value} Data Analysis:") + print() + + shard_data = data_by_k[k_value] + + for shard_count in sorted(shard_data.keys()): + shard_summaries = shard_data[shard_count] + + # Sort by k_ratio to ensure baseline (1.0) comes first + shard_summaries = sorted(shard_summaries, key=lambda x: x['k_ratio'], reverse=True) + + print(f"{shard_count} Shards:") + print() + + # Find baseline (ratio 1.0) QPS + baseline_qps = None + for summary in shard_summaries: + if abs(summary['k_ratio'] - 1.0) < 0.001: # Handle floating point comparison + baseline_qps = summary['qps'] + print(f"Ratio {summary['k_ratio']:.2f}: {summary['qps']:.1f} QPS") + break + + # Print other ratios with improvement percentages + for summary in shard_summaries: + if abs(summary['k_ratio'] - 1.0) >= 0.001: # Not baseline + qps = summary['qps'] + if baseline_qps and baseline_qps > 0: + improvement = ((qps - baseline_qps) / baseline_qps) * 100 + print(f"Ratio {summary['k_ratio']:.2f}: {qps:.1f} QPS → {improvement:+.0f}% improvement") + else: + print(f"Ratio {summary['k_ratio']:.2f}: {qps:.1f} QPS") + + print() + +def main(): + """Main function to run the analysis.""" + print("K Value Analysis Summary") + print("=" * 50) + print() + + # Find and load all summary files + files = find_summary_files() + print(f"Processing {len(files)} summary files...") + print() + + # Extract data from all files + all_summaries = [] + for file_path in files: + data = load_benchmark_data(file_path) + if data: + summary = extract_summary_data(data, file_path) + if summary: + all_summaries.append(summary) + + print(f"Successfully processed {len(all_summaries)} files") + print() + + # Organize data by K value and shard count + data_by_k = organize_data_by_k_value(all_summaries) + + # Generate and print the summary + generate_k_analysis_summary(data_by_k) + +if __name__ == "__main__": + main() diff --git a/analyze_shard_k_ratio.py b/analyze_shard_k_ratio.py index d1abbd28..e48a834e 100644 --- a/analyze_shard_k_ratio.py +++ b/analyze_shard_k_ratio.py @@ -490,6 +490,14 @@ def create_qps_vs_ratio_graph_for_k_workers(k_value: int, workers: int, summarie ax.plot(ratios, qps_values, 'o-', color=color, label=f'{shard_count} Shards', linewidth=2, markersize=6) + # Add labels for QPS values - positioned closer to points + for j, (x, y) in enumerate(zip(ratios, qps_values)): + # Alternate positioning: odd indices go slightly higher to avoid overlap + offset_y = 8 if j % 2 == 0 else 12 + ax.annotate(f'{y:.1f}', (x, y), 
textcoords="offset points", xytext=(0, offset_y), ha='center', + fontsize=8, color=color, weight='bold', + bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.7, edgecolor=color)) + # Set labels and title ax.set_xlabel('Shard K Ratio', fontsize=12) ax.set_ylabel('QPS (Queries Per Second)', fontsize=12) @@ -498,8 +506,12 @@ def create_qps_vs_ratio_graph_for_k_workers(k_value: int, workers: int, summarie plt.title(f'QPS vs Shard K Ratio - K={k_value}, {workers} Workers', fontsize=14, fontweight='bold') - # Add legend - ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', frameon=True, fontsize=10) + # Create proper legend below x-axis, centered in one line + ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', + ncol=len(by_shard_count), frameon=False, fontsize=10) + + # Adjust plot margins to prevent label truncation + plt.subplots_adjust(top=0.85, bottom=0.15) # Improve layout plt.tight_layout() diff --git a/extract_table_data.py b/extract_table_data.py index f0c67df5..4d511768 100644 --- a/extract_table_data.py +++ b/extract_table_data.py @@ -8,7 +8,7 @@ import glob import math -RESULTS_DIR = "/home/ubuntu/vector-db-benchmark/results/final" +RESULTS_DIR = "results/final" def extract_data_from_file(file_path): """Extract data from a single summary file.""" diff --git a/generate_performance_table.py b/generate_performance_table.py new file mode 100644 index 00000000..7f3a08e7 --- /dev/null +++ b/generate_performance_table.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +""" +Generate performance comparison table showing improvements from baseline (ratio=1.0) +to best performing ratio for each shard/K configuration. +""" + +import json +import os +import glob +import math +from typing import Dict, List, Optional, Tuple +import csv +from io import StringIO + +# Configuration +RESULTS_DIR = "results/final" + +def get_workers_from_filename(filename: str) -> int: + """Parse filename to extract workers count.""" + basename = os.path.basename(filename) + if "workers_" in basename: + workers_part = basename.split("workers_")[1] + workers = workers_part.split("-")[0] + else: + workers = "8" # default + return int(workers) + +def load_benchmark_data(file_path: str) -> Dict: + """Load and parse a benchmark summary JSON file.""" + try: + with open(file_path, 'r') as f: + data = json.load(f) + return data + except Exception as e: + print(f"Error loading {file_path}: {e}") + return {} + +def calculate_effective_k(k_ratio, shard_count, k): + """Calculate effective K per shard.""" + k_min = math.ceil(k / shard_count) + k_per_shard_req = math.ceil(k * k_ratio) + effective_k = max(k_min, k_per_shard_req) + return effective_k + +def calculate_actual_k_ratio(k_ratio, shard_count, k): + """Calculate actual K ratio based on effective K.""" + effective_k = calculate_effective_k(k_ratio, shard_count, k) + return effective_k / k + +def extract_summary_data(data: Dict, filename: str) -> Optional[Dict]: + """Extract summary data from a benchmark JSON file.""" + if 'precision_summary' not in data or 'precision' not in data: + return None + + precision_data = data['precision'] + if not precision_data: + return None + + # Get config from first precision entry + first_precision_key = list(precision_data.keys())[0] + config = precision_data[first_precision_key].get('config', {}) + + k_ratio = config.get('k_ratio') + shard_count = config.get('shard_count') + workers = get_workers_from_filename(filename) + + if k_ratio is None or shard_count is None: + return None + + # Get K value from search section + 
k_value = None + if 'search' in data and data['search']: + first_search_key = list(data['search'].keys())[0] + search_params = data['search'][first_search_key].get('params', {}) + k_value = search_params.get('top') + + if k_value is None: + return None + + # Extract performance data from precision_summary + precision_summary = data['precision_summary'] + best_precision = max(precision_summary.keys(), key=float) + best_data = precision_summary[best_precision] + + return { + 'filename': filename, + 'k_ratio_config': k_ratio, + 'k_ratio_actual': calculate_actual_k_ratio(k_ratio, shard_count, k_value), + 'shard_count': shard_count, + 'workers': workers, + 'k_value': k_value, + 'precision': float(best_precision), + 'qps': best_data.get('qps', 0), + 'p50': best_data.get('p50', 0), + 'p95': best_data.get('p95', 0), + } + +def find_baseline_and_best(summaries: List[Dict]) -> Tuple[Optional[Dict], Optional[Dict]]: + """Find baseline (ratio=1.0) and best performing configuration.""" + baseline = None + best_qps = None + + for summary in summaries: + # Find baseline (closest to ratio 1.0) + if abs(summary['k_ratio_actual'] - 1.0) < 0.01: + baseline = summary + + # Find best QPS + if best_qps is None or summary['qps'] > best_qps['qps']: + best_qps = summary + + return baseline, best_qps + +def calculate_improvements(baseline: Dict, best: Dict) -> Dict: + """Calculate percentage improvements from baseline to best.""" + if not baseline or not best: + return {} + + qps_improvement = ((best['qps'] - baseline['qps']) / baseline['qps']) * 100 + + # For latency, lower is better, so improvement is negative change + p95_improvement = ((baseline['p95'] - best['p95']) / baseline['p95']) * 100 + + # For accuracy, calculate loss (negative improvement) + accuracy_change = ((best['precision'] - baseline['precision']) / baseline['precision']) * 100 + + return { + 'qps_improvement': qps_improvement, + 'latency_improvement': p95_improvement, + 'accuracy_change': accuracy_change, + 'ratio_change': f"{baseline['k_ratio_actual']:.2f} → {best['k_ratio_actual']:.2f}" + } + +def main(): + """Main function to generate the performance table.""" + print("Generating performance comparison table...") + + # Find all summary files + pattern = os.path.join(RESULTS_DIR, "*-summary.json") + files = glob.glob(pattern) + print(f"Found {len(files)} summary files") + + # Process all files + all_summaries = [] + for file_path in files: + data = load_benchmark_data(file_path) + if data: + summary = extract_summary_data(data, os.path.basename(file_path)) + if summary: + all_summaries.append(summary) + + print(f"Extracted {len(all_summaries)} valid summaries") + + # Group by (shard_count, k_value, workers) + by_config = {} + for summary in all_summaries: + key = (summary['shard_count'], summary['k_value'], summary['workers']) + if key not in by_config: + by_config[key] = [] + by_config[key].append(summary) + + # Generate table data + table_rows = [] + + for (shard_count, k_value, workers), summaries in sorted(by_config.items()): + baseline, best = find_baseline_and_best(summaries) + + if baseline and best and baseline != best: + improvements = calculate_improvements(baseline, best) + if improvements: + table_rows.append({ + 'shard_count': shard_count, + 'k_value': k_value, + 'workers': workers, + **improvements + }) + + # Print formatted table + print("\n" + "="*100) + print("PERFORMANCE COMPARISON TABLE") + print("="*100) + print(f"{'Shard Count':<12} {'K Value':<8} {'Workers':<8} {'Ratio Change':<15} {'QPS Improvement':<16} {'Latency 
Improvement':<20} {'Accuracy Change':<15}") + print("-"*100) + + for row in table_rows: + print(f"{row['shard_count']:<12} {row['k_value']:<8} {row['workers']:<8} " + f"{row['ratio_change']:<15} {row['qps_improvement']:+.0f}%{'':<11} " + f"{row['latency_improvement']:+.0f}%{'':<15} {row['accuracy_change']:+.1f}%") + + # Generate tab-separated table (easy to copy) + print("\n" + "="*100) + print("TAB-SEPARATED TABLE (copy-paste ready):") + print("="*100) + print("Shard Count\tK Value\tWorkers\tRatio Change\tQPS Improvement (%)\tLatency Improvement (%)\tAccuracy Change (%)") + + for row in table_rows: + print(f"{row['shard_count']}\t{row['k_value']}\t{row['workers']}\t" + f"{row['ratio_change']}\t{row['qps_improvement']:+.0f}\t" + f"{row['latency_improvement']:+.0f}\t{row['accuracy_change']:+.1f}") + + # Generate CSV output + print("\n" + "="*100) + print("CSV FORMAT (copy-paste ready):") + print("="*100) + + csv_output = StringIO() + csv_writer = csv.writer(csv_output) + + # Write header + csv_writer.writerow(['Shard Count', 'K Value', 'Workers', 'Ratio Change', 'QPS Improvement (%)', 'Latency Improvement (%)', 'Accuracy Change (%)']) + + # Write data rows + for row in table_rows: + csv_writer.writerow([ + row['shard_count'], + row['k_value'], + row['workers'], + row['ratio_change'], + f"{row['qps_improvement']:+.0f}", + f"{row['latency_improvement']:+.0f}", + f"{row['accuracy_change']:+.1f}" + ]) + + print(csv_output.getvalue()) + + # Save to file + csv_filename = "performance_comparison.csv" + with open(csv_filename, 'w', newline='') as csvfile: + csvfile.write(csv_output.getvalue()) + + print(f"\nResults saved to: {csv_filename}") + print(f"Total configurations analyzed: {len(table_rows)}") + +if __name__ == "__main__": + main()
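Note on the sign conventions in generate_performance_table.py: QPS and accuracy improvements are relative changes from the baseline (ratio 1.0) run, while the latency figure flips the sign so that a lower p95 reads as a positive improvement. A small worked sketch with hypothetical values, not taken from any benchmark run:

# Baseline (ratio 1.0) vs best configuration, hypothetical numbers.
baseline = {'qps': 100.0, 'p95': 40.0, 'precision': 0.99}
best = {'qps': 150.0, 'p95': 30.0, 'precision': 0.97}

qps_improvement = (best['qps'] - baseline['qps']) / baseline['qps'] * 100          # +50%
latency_improvement = (baseline['p95'] - best['p95']) / baseline['p95'] * 100      # +25%, lower p95 is better
accuracy_change = (best['precision'] - baseline['precision']) / baseline['precision'] * 100  # about -2.0%

print(f"{qps_improvement:+.0f}% QPS, {latency_improvement:+.0f}% latency, {accuracy_change:+.1f}% accuracy")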