From c01e91e9d8d67609309366e6cd1d1f6e9e8e614b Mon Sep 17 00:00:00 2001 From: John Bowyer Date: Fri, 24 Jan 2025 12:35:00 -0500 Subject: [PATCH 1/2] docs: Add smartbrevity overview materials for Data Engineering Boot Camp summaries. --- ...ng_boot_camp_kick_off_and_informational.md | 100 +++++++++++++++ ...eling_complex_data_types_and_cumulation.md | 113 +++++++++++++++++ ..._cumulative_dimensions_struct_and_array.md | 85 +++++++++++++ ...wly_changing_dimensions_and_idempotency.md | 94 ++++++++++++++ ...uilding_slowly_changing_dimensions_scds.md | 93 ++++++++++++++ ...ing_graph_databases_additive_dimensions.md | 98 ++++++++++++++ .../week_1_w_free_boot_camp_live_stream.md | 115 +++++++++++++++++ ...2_how_meta_models_big_volume_event_data.md | 120 ++++++++++++++++++ ...ek_2_w_free_boot_camp_live_black_friday.md | 96 ++++++++++++++ ...gold_pipeline_like_airbnb_midas_process.md | 102 +++++++++++++++ ...ce_spark_dataframe_dataset_udfs_caching.md | 110 ++++++++++++++++ .../week_3_master_data_contracts.md | 111 ++++++++++++++++ ...k_iceberg_memory_tuning_joins_partition.md | 95 ++++++++++++++ ...eek_3_testing_apache_spark_jobs_in_cicd.md | 100 +++++++++++++++ .../week_3_w3_free_boot_camp_live_stream.md | 81 ++++++++++++ ...sign_patterns_at_meta_growth_accounting.md | 88 +++++++++++++ .../week_4_w4_free_boot_camp_live.md | 92 ++++++++++++++ ...adriven_business_value_with_tableau_viz.md | 114 +++++++++++++++++ ..._data_pipelines_like_netflix_and_airbnb.md | 46 +++++++ ...ew_year_and_free_boot_camp_wrap_up_live.md | 39 ++++++ ...tone_projects_for_analytics_engineering.md | 107 ++++++++++++++++ ...ime_data_pipelines_with_kafka_and_flink.md | 113 +++++++++++++++++ 22 files changed, 2112 insertions(+) create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_master_data_contracts.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_spark_iceberg_memory_tuning_joins_partition.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_testing_apache_spark_jobs_in_cicd.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_w3_free_boot_camp_live_stream.md create mode 100644 
bootcamp/materials/0-smart-brevity-overview/week_4_data_engineer_design_patterns_at_meta_growth_accounting.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_4_w4_free_boot_camp_live.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_6_build_datadriven_business_value_with_tableau_viz.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_6_maintain_data_pipelines_like_netflix_and_airbnb.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_6_new_year_and_free_boot_camp_wrap_up_live.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_end_additional_jobready_capstone_projects_for_analytics_engineering.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_end_additional_master_realtime_data_pipelines_with_kafka_and_flink.md diff --git a/bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md b/bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md new file mode 100644 index 000000000..6a3d37394 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md @@ -0,0 +1,100 @@ +# Free Data Engineering Boot Camp Kickoff Summary + +```mermaid +mindmap + root((Free Data Engineering Boot Camp)) + Program Structure [00:01:01] + Six weeks intensive + 1-2 hours daily commitment + Pre-recorded lessons + Two components per module + Lecture + Lab + AI-graded homework + Discord community support + Curriculum [00:06:02] + Data Modeling + Two weeks coverage + Foundation concepts + Data product focus + Analytical Patterns + Growth accounting + Advanced SQL + Window functions + KPIs & Experimentation + Metrics definition + Product thinking + Data Visualization + Communication + Tableau basics + Dashboard types + Infrastructure Track + Unit testing + Pipeline maintenance + Apache Spark fundamentals + Data quality patterns + Real-time pipelines + Certification Paths [00:19:40] + Watch-only certificate + Attendance tracking + Basic recognition + Full certification + Complete all homework + Watch all content + Expected 3-4% completion rate + Paid vs Free Differences [00:27:28] + Cloud infrastructure access + AWS deployment + One year access + Paid APIs available + Enhanced support + Weekly Q&A sessions + Industry expert speakers + Dedicated TA support + Additional content + Snowflake + Trino + DBT + Apache Iceberg + Capstone project + Dedicated feedback + Portfolio building + Job interview training +``` + + +*A comprehensive 6-week program launching online with daily content releases at 5 PM Pacific.* + +**Big picture:** Tech expert Zach is offering free data engineering training to help 1,000 engineers land jobs by February 15, with content available on YouTube until December 2025. 
+ +**Key details:** +- 10,000+ enrolled students +- 1-2 hours daily commitment recommended +- All content pre-recorded and uploaded daily +- Includes AI-graded homework assignments +- Discord community support available + +**Core curriculum:** +- Data modeling (2 weeks) +- Analytical patterns and advanced SQL +- KPIs and experimentation +- Data visualization +- Infrastructure and pipeline maintenance +- Apache Spark fundamentals +- Real-time pipelines with Flink and Kafka + +**Success metrics:** Only about 300-400 out of 10,000 students expected to complete certification, requiring: +- Watching all videos +- Completing all homework assignments +- Active participation in community + +**What's different in paid version:** +- Cloud infrastructure access +- Weekly Q&As with Zach +- Industry expert speakers +- Additional tools: Snowflake, DBT, AWS Glue +- Direct TA support +- Capstone project mentorship +- Job interview training + +**Bottom line:** While free version offers substantial technical training, paid version ($) provides more hands-on support and cloud-based tools for job preparation. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md new file mode 100644 index 000000000..ded341c70 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md @@ -0,0 +1,113 @@ +# Data Modeling: Complex Types and Cumulative Tables Deep Dive + +*A comprehensive look at dimensional data modeling principles, focusing on the balance between data efficiency and usability.* + + +```mermaid +mindmap + root((Dimensional + Data + Modeling - Intro)) + (Understanding Dimensions) + (Identifier Dimensions) + (Uniquely identify entities) + (User ID) + (Social Security) + (Device ID) + (Attributes) + (Slowly Changing) + (Time dependent values) + (Can change over time) + (Fixed) + (Birthday) + (Phone manufacturer) + (Data Modeling Types) + (OLTP) + (Online Transaction Processing) + (Normalized) + (Minimal duplication) + (Fast single row operations) + (Master Data) + (Middle ground) + (Complete entity definitions) + (Reference data) + (OLAP) + (Online Analytical Processing) + (Denormalized) + (Optimized for analysis) + (Population level queries) + (Cumulative Table Design) + (Historical Analysis) + (State Transitions) + (Uses Today + Yesterday data) + (Full outer join approach) + (Drawbacks) + (Sequential backfilling) + (PII management challenges) + (Complex Data Types) + (Struct) + (Table within table) + (Different value types) + (Array) + (Ordered lists) + (Same data type) + (Map) + (Key-value pairs) + (Same value type) + (Data Consumer Types) + (Data Analysts) + (Need simple flat data) + (Easy to query) + (Data Engineers) + (Can handle complex types) + (Build downstream pipelines) + (ML Models) + (Need identifier + features) + (Flat structure preferred) + (Customers) + (Need visualizations) + (Charts over raw data) + (Compactness vs Usability) + (Most Compact) + (Compressed data) + (Online systems) + (Middle Ground) + (Arrays and structs) + (Master data) + (Most Usable) + (Flat structure) + (Analytics focused) +``` + + +**Big picture:** Data modeling strategies vary significantly based on end users' needs, from analysts requiring simple flat tables to engineers working with compressed complex data types. 
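+
+**Illustrative sketch:** The struct/array/map tradeoffs listed below are easier to see against a concrete schema. This is a rough PostgreSQL sketch with hypothetical names (`season_stat`, `player_dim`), not a table from the course; a `JSONB` column stands in for the "map" type, since PostgreSQL has no native map.
+
+```sql
+-- "Struct": a named composite type, i.e. a table within a table.
+CREATE TYPE season_stat AS (
+    season INTEGER,
+    games  INTEGER,
+    points REAL
+);
+
+CREATE TABLE player_dim (
+    player_name  TEXT PRIMARY KEY,
+    season_stats season_stat[],  -- "Array": ordered elements of a single type
+    attributes   JSONB           -- "Map"-like: flexible key-value pairs
+);
+```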
+ +**Key dimensions types:** +- Identifier dimensions (unique entity IDs) +- Slowly changing dimensions (values change over time) +- Fixed dimensions (unchangeable values) + +**Data modeling layers:** +- OLTP (transactional): Optimized for single-record operations +- Master data: Middle ground, combines completeness with efficiency +- OLAP (analytical): Optimized for aggregation and analysis +- Metrics: Highest level of aggregation + +**Cumulative table design benefits:** +- Maintains complete historical records +- Enables efficient historical analysis +- Supports state transition tracking +- Reduces query complexity + +**Watch out for:** +- Sequential processing requirement limits parallel backfilling +- Privacy concerns with historical data retention +- Storage size growth over time +- Shuffle operations breaking data sorting in distributed systems + +**Complex data types tradeoffs:** +- Arrays: Best for ordered data, same-type elements +- Structs: Flexible "table within table" approach +- Maps: Dynamic key-value pairs with type restrictions + +**Bottom line:** Success in dimensional modeling requires understanding your data consumers and balancing compression efficiency with query usability. Master data serves as a critical middle ground between transactional and analytical needs. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md new file mode 100644 index 000000000..982044c0d --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md @@ -0,0 +1,85 @@ +# Building Cumulative Tables with Complex Data Types: Lab Tutorial + +*A hands-on demonstration of creating efficient dimensional tables using PostgreSQL arrays and structs to track NBA player statistics over time.* + + +```mermaid +mindmap + root((Dimensional Data Modeling + Lab)) + (Data Structure) + [Player Seasons Table] + (Temporal Components) + (Player Attributes) + (Name) + (Height) + (College) + (Country) + (Draft Info) + (Season Stats) + (Games Played) + (Points) + (Rebounds) + (Assists) + (Data Types) + [Custom Types] + (season_stats struct) + (scoring_class enum) + (Table Design) + [Players Table] + (Primary Key) + (player_name) + (current_season) + (Non-temporal Columns) + (season_stats Array) + (Additional Metrics) + (years_since_last_season) + (scoring_class) + (Cumulative Pattern) + [Benefits] + (Maintains Data History) + (Efficient Joins) + (No Shuffling Required) + (Fast Analytics) + [Implementation] + (Yesterday Query) + (Today Query) + (Full Outer Join) + (Array Concatenation) + (Analytics Capabilities) + [Historical Analysis] + (Player Progress) + (Career Gaps) + (Performance Metrics) + [Data Transformations] + (Unnest Operations) + (Array Manipulations) +``` + + +**Big picture:** Converting season-by-season player statistics into a cumulative table using complex data types reduces data duplication and maintains data sorting efficiency while enabling quick historical analysis. 
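+
+**Illustrative sketch:** A minimal version of the cumulative pattern detailed in the components and steps below, assuming simplified names and a yearly grain; the lab's actual tables carry more columns.
+
+```sql
+CREATE TYPE season_stat AS (season INTEGER, gp INTEGER, pts REAL);
+
+CREATE TABLE player_seasons (player_name TEXT, season INTEGER, gp INTEGER, pts REAL);
+
+CREATE TABLE players (
+    player_name    TEXT,
+    season_stats   season_stat[],  -- one array element per season played
+    current_season INTEGER,
+    PRIMARY KEY (player_name, current_season)
+);
+
+-- Each run full-outer-joins "yesterday" (the cumulated table) with "today"
+-- (one new season of data) and appends the new season onto the array.
+INSERT INTO players
+WITH yesterday AS (
+    SELECT * FROM players WHERE current_season = 1995
+), today AS (
+    SELECT * FROM player_seasons WHERE season = 1996
+)
+SELECT
+    COALESCE(t.player_name, y.player_name) AS player_name,
+    COALESCE(y.season_stats, ARRAY[]::season_stat[])
+        || CASE WHEN t.season IS NOT NULL
+                THEN ARRAY[ROW(t.season, t.gp, t.pts)::season_stat]
+                ELSE ARRAY[]::season_stat[]
+           END AS season_stats,
+    COALESCE(t.season, y.current_season + 1) AS current_season
+FROM today t
+FULL OUTER JOIN yesterday y ON t.player_name = y.player_name;
+```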
+ +**Key components:** +- Custom struct type for season statistics +- Array column to store multiple seasons +- Tracking columns for scoring class and years since last season +- Full outer join logic for cumulative updates + +**Implementation steps:** +- Create custom struct type for season stats (points, games, rebounds, assists) +- Build base table with player attributes and season stats array +- Implement incremental loading logic using full outer joins +- Add derived columns for player classification and activity tracking + +**Performance benefits:** +- No GROUP BY operations needed for historical analysis +- Maintains data sorting after joins +- Reduces storage through elimination of duplicated data +- Enables fast parallel processing + +**Real-world example:** +- Tracked Michael Jordan's career gap (1996-1997, returned 2001) +- Demonstrated scoring progression from first to last season +- Identified most improved players without expensive aggregations + +**Bottom line:** Complex data types with cumulative loading provide significant performance advantages for dimensional data that changes over time, while maintaining data usability through unnesting capabilities. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md new file mode 100644 index 000000000..a5606e912 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md @@ -0,0 +1,94 @@ +# Idempotency and Slowly Changing Dimensions in Data Engineering + +*A comprehensive guide to building reliable data pipelines and handling temporal dimension changes.* + + +```mermaid +mindmap + root((Data Modeling & + SCDs Intro)) + )Idempotent Pipelines( + (Problems to Avoid) + Insert without truncate + Missing end date in queries + Incomplete partition sensors + Depends on past issues + Latest partition dependency + (Definition) + Same results regardless of + When ran + How many times ran + Time of day + (Benefits) + Reproducible results + Easier troubleshooting + Better unit testing + Fewer silent failures + )Slowly Changing Dimensions( + (Definition) + Attributes that change over time + Examples + Age + Phone preferences + Country + Food preferences + (Types) + Type 0 + Static dimensions + Never change + Type 1 + Only latest value + Not recommended for analytics + Type 2 + Start and end dates + Complete history + Airbnb gold standard + Type 3 + Original and current value + Limited history + (Implementation Methods) + Full history processing + Incremental loading + (Modeling Approaches) + Latest snapshot + Not recommended + Non-idempotent + Daily snapshots + Simple but storage heavy + Max's preferred approach + SCD modeling + Compressed history + Type 2 recommended +``` + + +**Big picture:** Data pipelines must produce consistent results regardless of when or how many times they run. Slowly changing dimensions (SCDs) need careful modeling to maintain data accuracy over time. 
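+
+**Illustrative sketch:** A hypothetical shape for an idempotent daily load (table names are invented for this example): the target day is cleared before inserting and the input window is fully bounded, so production runs, re-runs, and backfills all land the same result, avoiding the pitfalls listed below.
+
+```sql
+CREATE TABLE IF NOT EXISTS orders (
+    order_id    BIGINT,
+    customer_id BIGINT,
+    order_date  DATE
+);
+
+CREATE TABLE IF NOT EXISTS daily_orders_agg (
+    order_date  DATE,
+    customer_id BIGINT,
+    order_count BIGINT,
+    PRIMARY KEY (order_date, customer_id)
+);
+
+BEGIN;
+
+-- Clear the partition being (re)built instead of blindly appending to it.
+DELETE FROM daily_orders_agg WHERE order_date = DATE '2024-01-15';
+
+INSERT INTO daily_orders_agg (order_date, customer_id, order_count)
+SELECT order_date, customer_id, COUNT(*) AS order_count
+FROM orders
+WHERE order_date >= DATE '2024-01-15'   -- bounded window: exactly one day,
+  AND order_date <  DATE '2024-01-16'   -- never "everything up to now"
+GROUP BY order_date, customer_id;
+
+COMMIT;
+```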
+ +**Idempotency essentials:** +- Pipelines should produce identical results whether run in production or backfill +- Multiple runs should not create duplicates +- Time windows must have clear boundaries +- All input dependencies must be verified + +**Common idempotency pitfalls:** +- Using INSERT without TRUNCATE +- Unbounded date ranges +- Missing partition sensors +- Relying on latest data instead of point-in-time +- Cumulative dependencies on non-idempotent sources + +**SCD modeling approaches:** +- Type 0: Fixed dimensions that never change +- Type 1: Latest value only (avoid for analytics) +- Type 2: Full history with start/end dates (recommended) +- Type 3: Original and current values only + +**Best practices for Type 2 SCDs:** +- Use start and end dates for each value period +- Mark current records with future end date +- Can be loaded via full rebuild or incremental updates +- Choose implementation based on data volume and business needs + +**Bottom line:** Idempotent pipelines and Type 2 SCDs are critical for reliable analytics, though implementation complexity should be balanced against business value and maintenance costs. + +**Watch out for:** Using latest snapshot dimensions in analytical pipelines - this breaks idempotency and can cause incorrect historical analysis. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md new file mode 100644 index 000000000..0b50a05ec --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md @@ -0,0 +1,93 @@ +# Building Slowly Changing Dimensions (SCD) in Data Modeling + +*A technical deep dive into implementing SCD Type 2 for tracking historical changes in dimensional data using PostgreSQL.* + +```mermaid +mindmap + root((Data Modeling & + SCDs Intro Lab)) + Table Structure + Primary Key + Player Name + Start Season + Tracking Columns + Scoring Class + Is Active + Time Columns + Start Season + End Season + Current Season + Full History Approach + Window Functions + LAG for Previous Values + Partition by Player Name + Order by Season + Change Detection + Compare Current vs Previous + Generate Change Indicators + Streak Identification + Sum Change Indicators + Group Records by Streaks + Advantages + Simple to Understand + Good for Smaller Dimensions + Disadvantages + Memory Intensive + Processes All History + Multiple Window Functions + Incremental Approach + Components + Historical SCD + Last Season SCD + This Season Data + Record Types + Unchanged Records + Extend End Date + Changed Records + Close Old Record + Create New Record + New Records + Create First Record + Advantages + Less Data Processing + More Efficient + Better for Large Dimensions + Disadvantages + More Complex Logic + Sequential Dependencies + Harder to Backfill + Best Practices + Handle Null Values + Check Data Quality + Consider Data Volume + Document Assumptions + Test Edge Cases +``` + +**Big picture:** Two approaches demonstrated for tracking dimensional changes over time - a full historical rebuild and an incremental update method. 
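+
+**Illustrative sketch:** A rough version of the full-rebuild approach described below, run against a hypothetical yearly snapshot table; the lab's real queries track more attributes and load the result into an SCD table.
+
+```sql
+CREATE TABLE IF NOT EXISTS player_snapshots (
+    player_name   TEXT,
+    season        INTEGER,
+    scoring_class TEXT,
+    is_active     BOOLEAN
+);
+
+-- Compare each season to the previous one, flag changes, turn the running
+-- sum of flags into a streak id, then collapse each streak into one SCD row.
+WITH with_previous AS (
+    SELECT
+        player_name, season, scoring_class, is_active,
+        LAG(scoring_class) OVER w AS prev_scoring_class,
+        LAG(is_active)     OVER w AS prev_is_active
+    FROM player_snapshots
+    WINDOW w AS (PARTITION BY player_name ORDER BY season)
+), with_change_flag AS (
+    SELECT *,
+        CASE WHEN scoring_class IS DISTINCT FROM prev_scoring_class
+               OR is_active     IS DISTINCT FROM prev_is_active
+             THEN 1 ELSE 0 END AS changed
+    FROM with_previous
+), with_streak AS (
+    SELECT *,
+        SUM(changed) OVER (PARTITION BY player_name ORDER BY season) AS streak_id
+    FROM with_change_flag
+)
+SELECT
+    player_name,
+    scoring_class,
+    is_active,
+    MIN(season) AS start_season,
+    MAX(season) AS end_season
+FROM with_streak
+GROUP BY player_name, streak_id, scoring_class, is_active
+ORDER BY player_name, start_season;
+```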
+ +**Key components of SCD Type 2:** +- Start and end dates for each dimension record +- Support for tracking multiple changing attributes +- Maintains complete historical record of changes +- Primary key based on entity name and start date + +**Full historical rebuild approach:** +- Uses window functions to detect changes +- Generates streak identifiers for tracking changes +- More memory-intensive but simpler to implement +- Works well for smaller dimensional tables (millions of records) +- Requires scanning all historical data + +**Incremental update method:** +- Processes only changed records and new data +- More complex query logic but better performance +- Handles three scenarios: + - Unchanged records (extend end date) + - Changed records (close old record, create new) + - New records (create initial record) +- Better for larger datasets but requires sequential processing + +**Bottom line:** Choice between approaches depends on data volume and processing requirements. Full rebuild is simpler but less efficient; incremental update is more complex but better performing at scale. + +**Watch out for:** Null handling in dimensional attributes can break comparison logic. Always validate assumptions about data quality when implementing either approach. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md new file mode 100644 index 000000000..e2c9994d1 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md @@ -0,0 +1,98 @@ +# Graph Data Modeling and Advanced Dimensional Concepts + +*A deep dive into graph databases, additive dimensions, and flexible schema patterns for complex data modeling scenarios.* + + +```mermaid +mindmap + root((Dimensional Data + Modeling Graph Intro)) + (Graph Data Modeling) + (Focus on Relationships) + (Less Entity Focused) + (Connection-Centric) + (Schema Structure) + (Flexible Schema) + (Basic Components) + (Vertices) + (ID) + (Type) + (Properties) + (Edges) + (Subject ID & Type) + (Object ID & Type) + (Edge Type) + (Properties) + (Additive vs Non-Additive Dimensions) + (Additivity Concept) + (Time Window Dependency) + (Single Value Rule) + (Examples) + (Honda Cars vs Drivers) + (Age Groups) + (Device Users) + (Impact on Analytics) + (Count Metrics) + (Ratio Metrics) + (Aggregation Methods) + (Enums Power) + (Benefits) + (Built-in Data Quality) + (Static Fields) + (Documentation) + (Usage Guidelines) + (Less than 50 Values) + (Partition Values) + (Little Book of Pipelines) + (Source Functions) + (Shared Schema) + (Data Quality Checks) + (Partitioned Output) + (Flexible Data Types) + (Map Benefits) + (Schema Evolution) + (Dynamic Properties) + (Other Properties Column) + (Drawbacks) + (Poor Compression) + (Storage Overhead) +``` + + +**Big picture:** Graph data modeling shifts focus from entities to relationships, using a standardized structure to track how different objects connect and interact. 
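+
+**Illustrative sketch:** The generic vertex/edge layout described below, written as two PostgreSQL tables. The enum values and the use of `JSONB` as the properties "map" are assumptions for illustration, not the course's exact schema.
+
+```sql
+CREATE TYPE vertex_type AS ENUM ('player', 'team', 'game');
+CREATE TYPE edge_type   AS ENUM ('plays_on', 'plays_against', 'plays_in');
+
+CREATE TABLE vertices (
+    identifier TEXT,
+    type       vertex_type,
+    properties JSONB,                       -- flexible key-value attributes
+    PRIMARY KEY (identifier, type)
+);
+
+CREATE TABLE edges (
+    subject_identifier TEXT,
+    subject_type       vertex_type,
+    object_identifier  TEXT,
+    object_type        vertex_type,
+    edge_type          edge_type,           -- usually a verb
+    properties         JSONB,
+    PRIMARY KEY (subject_identifier, subject_type,
+                 object_identifier, object_type, edge_type)
+);
+```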
+ +**Key concepts:** +- Additive vs non-additive dimensions +- Enumerated types for data quality +- Flexible schemas using maps +- Graph database fundamentals + +**Additive dimensions:** +- Can aggregate subtotals directly +- Entity can only have one value at a time +- Examples: age groups, car counts +- Time window affects additivity + +**Non-additive dimensions:** +- Require count distinct operations +- Entity can have multiple values simultaneously +- Examples: platform users (iOS/Android), car drivers +- Impact ratio metrics and aggregations + +**Graph database structure:** +- Vertices (nodes): + - Identifier + - Type + - Properties map +- Edges (relationships): + - Subject and object identifiers/types + - Edge type (usually verbs) + - Relationship properties + +**Pattern highlight - "Little Book of Pipelines":** +- Uses enums to manage multiple data sources +- Shared schema with custom quality checks +- Scales well for large integration projects +- Examples: unit economics, infrastructure graphs + +**Bottom line:** Choose modeling approach based on use case - enums for limited values (<50), flexible schemas for varying attributes, and graph databases when relationships matter more than entities themselves. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md b/bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md new file mode 100644 index 000000000..3a3e6ed97 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md @@ -0,0 +1,115 @@ +# Data Expert Free Boot Camp Live Q&A Highlights + +*A comprehensive Q&A session covering data engineering career paths, boot camp details, and industry insights.* + + +```mermaid +mindmap + root((Data Engineering)) + Career Growth + Skills + SQL & Python fundamentals + Cloud platforms + AWS primary focus + Apache tools + Spark + Kafka + Flink + Data formats + Parquet + ORC + Orchestration + Airflow + Portfolio Building + Build practical projects + Focus on impact + Show robustness + Demonstrate quality + Provide proof/visualization + Technologies + Cloud Platforms + AWS dominates 55% market + Snowflake for usability + Databricks for features + File Formats + Parquet preferred + ORC alternative + Iceberg for modern lakes + Databases + Vector DBs trending + Postgres versatile + Elastic for search + Industry Trends + AI Impact + Not replacing DE + Enhancing pipeline maintenance + Making work less stressful + Still needs data quality + Data Growth + More data expected + Growing importance of DE + Vector databases rising + Tools Evolution + Moving from Hadoop + Cloud-native solutions + Real-time processing + Best Practices + Pipeline Design + Include staging steps + Consider backfill efficiency + Monitor data quality + Handle failures gracefully + Learning Approach + Learn by doing + Build real projects + Network with seniors + Stay updated with trends + Career Development + Start as analyst if needed + Focus on fundamentals + Create content + Network actively + Work-Life + Time Management + Set boundaries + Balance learning & work + Prioritize important tasks + Skill Development + Continuous learning + Practical experience + Keep up with trends + Career Growth + Build portfolio + Network + Create content +``` + +**Big picture:** Zach Wilson addresses key questions about his free and paid data engineering boot camps, career transitions, and the future of data engineering amidst AI advancements. 
+ +**Boot camp details:** +- Free content available until January 31, 2025 +- New fact data modeling content releasing tomorrow (5 hours) +- AI-graded homework assignments +- Certification opportunities +- Discord community support + +**Key career insights:** +- Focus on AWS over Azure/GCP due to market share (55%) +- Entry-level DE roles are limited; consider data analyst path first +- Core skills needed: SQL, Python, cloud platforms +- Practical experience trumps certifications + +**Technology perspectives:** +- Kafka remains exciting for real-time data +- Parquet vs ORC file formats both relevant +- DBT valuable for enforcing good practices +- Snowflake gaining momentum for usability + +**Business update:** +- Free boot camp expanding reach (15,000 daily active users) +- Paid boot camp ($2,000 with discount) filling up +- Content helping both beginners and senior engineers + +**Bottom line:** Success in data engineering requires continuous learning, practical experience, and understanding business needs. Focus on fundamentals before chasing new technologies. + +**Watch out for:** Time zone compatibility with live sessions (particularly challenging for European participants). \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md b/bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md new file mode 100644 index 000000000..4a1c591ac --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md @@ -0,0 +1,120 @@ +# Understanding Fact Data Modeling and Volume Optimization + +*A comprehensive exploration of fact data modeling techniques, focusing on data volume optimization and efficiency in large-scale systems.* + + +```mermaid +mindmap + root((Fact Data Modeling)) + Fact Fundamentals + Definition + Atomic events + Cannot be broken down further + Represents actions/occurrences + Characteristics + High volume + Immutable + Time-based + Context dependent + Components + Who fields + User IDs + Device IDs + Where fields + Location + Page/section + When fields + Timestamps + UTC standardization + What fields + Event types + Actions + How fields + Methods + Tools used + Data Volume Management + Raw Facts + Highest granularity + Most flexible + Highest volume + Daily Aggregates + Medium volume + Grouped by day + Preserves some granularity + Reduced Facts + Lowest volume + Array-based storage + Monthly/yearly grouping + Performance Optimization + Shuffle Minimization + Reduce data volume + Pre-bucketing + Optimize joins + SQL Operations + Select/From/Where + Most scalable + No shuffle needed + Group By/Join + Requires shuffle + Medium impact + Order By + Least scalable + Global sorting + Implementation Techniques + Date List Structure + Bit-based storage + 30/31 day periods + Efficient querying + Array Metrics + Monthly arrays + Index-based dates + Efficient aggregation + Quality Guarantees + No duplicates + Required fields + Clean schemas + Business Applications + Long-term Analysis + Historical patterns + Slow decline detection + Multi-year trends + Dimensional Analysis + User segmentation + Geographic patterns + Device patterns + Performance Benefits + Faster queries + Lower storage costs + Better scalability +``` + + +**Big picture:** Fact data modeling requires careful consideration of volume, performance, and usability tradeoffs. Three main approaches exist, each with distinct advantages for different use cases. 
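+
+**Illustrative sketch:** A hypothetical version of the reduced-fact approach listed below, in PostgreSQL terms: one row per user per month, with a daily metric stored as an array indexed by day of month. Table and column names are invented, and the load assumes the pipeline runs once per day in order (the sequential-processing tradeoff noted elsewhere in the course).
+
+```sql
+CREATE TABLE IF NOT EXISTS site_hits (
+    user_id  BIGINT,
+    hit_date DATE,
+    url      TEXT
+);
+
+-- Reduced fact: hits[1] is day 1 of the month, hits[2] is day 2, and so on.
+-- (A bit-mask "datelist" encoding is an even tighter variant of the same idea.)
+CREATE TABLE IF NOT EXISTS monthly_user_site_hits (
+    user_id     BIGINT,
+    month_start DATE,
+    hits        BIGINT[],
+    PRIMARY KEY (user_id, month_start)
+);
+
+-- Fold one day's counts into the month row: "yesterday" is the month-to-date
+-- array, "today" is the new day's aggregate; users first seen today get
+-- zero-padded for the days they missed.
+WITH today AS (
+    SELECT user_id, COUNT(*)::BIGINT AS day_hits
+    FROM site_hits
+    WHERE hit_date = DATE '2024-01-03'
+    GROUP BY user_id
+), yesterday AS (
+    SELECT user_id, hits
+    FROM monthly_user_site_hits
+    WHERE month_start = DATE '2024-01-01'
+)
+INSERT INTO monthly_user_site_hits (user_id, month_start, hits)
+SELECT
+    COALESCE(t.user_id, y.user_id) AS user_id,
+    DATE '2024-01-01'              AS month_start,
+    COALESCE(y.hits, ARRAY_FILL(0::BIGINT, ARRAY[DATE '2024-01-03' - DATE '2024-01-01']))
+        || COALESCE(t.day_hits, 0) AS hits
+FROM today t
+FULL OUTER JOIN yesterday y ON t.user_id = y.user_id
+ON CONFLICT (user_id, month_start) DO UPDATE SET hits = EXCLUDED.hits;
+```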
+ +**Key modeling approaches:** +- Raw facts: Highest granularity, largest volume +- Daily aggregates: Medium volume, good for 1-2 year analyses +- Reduced facts: Lowest volume, best for long-term analysis + +**Performance impacts:** +- Shuffling is a major bottleneck in distributed computing +- SQL operations affect parallelism differently: + - SELECT/FROM/WHERE: Highly parallel + - GROUP BY/JOIN: Requires shuffling + - ORDER BY: Least parallel, avoid for large datasets + +**Innovation highlight - Reduced facts:** +- Stores data as arrays indexed by date +- Reduces storage by ~95% +- Enables decade-long analyses in hours vs weeks +- Maintains daily granularity while minimizing volume + +**Implementation techniques:** +- Use array types for efficient storage +- Leverage bit operations for activity tracking +- Apply careful date indexing for time-based queries +- Pre-aggregate data while maintaining granularity + +**Bottom line:** Successful fact data modeling requires balancing between data volume, query performance, and analytical flexibility. Reduced facts offer significant performance benefits for long-term analyses but require careful implementation. + +**Watch out for:** Dimensional joins can become complex with reduced facts. Consider analytical needs and access patterns when choosing modeling approach. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md b/bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md new file mode 100644 index 000000000..c7d8f1c34 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md @@ -0,0 +1,96 @@ +# Data Engineering Boot Camp Week 2 Live Q&A Session and Black Friday Updates + +*A comprehensive Q&A session covering data engineering career paths, technology choices, and boot camp details.* + + +```mermaid +mindmap + root((Data Engineering)) + Boot Camps + Free Boot Camp + Weeks 1-2 Completed + Data modeling + Facts & dimensions + Dateless structures + Week 3 upcoming + Spark content + Available until Jan 31 2025 + Certification requires homework + Paid Boot Camp + Starts January 6th + Cloud focus + AWS + Databricks + Snowflake + Live classes + Tuesday Wednesday Thursday + 6-8PM PST + Features + Mentorship + Capstone project + 1 year access + Technologies + Data Warehousing + Snowflake + Databricks + BigQuery + Processing + Apache Spark + Apache Flink + Apache Kafka + Storage + S3/Blob Storage + Apache Iceberg + Delta Lake + Career Topics + Skills Needed + SQL + Python + Data Modeling + Cloud Platforms + Business Acumen + Job Market + Analytics Engineer trend + Platform Engineer trend + AI/ML integration + Interview preparation + Future Trends + AI Integration + LLMs in pipelines + Vector databases + Embeddings + RAG applications + Cloud Evolution + Serverless + Platform consolidation + Multi-cloud +``` + + +**Big picture:** Week 2 covered fact data modeling, with Week 3 starting tomorrow focusing on Apache Spark. The January paid boot camp will feature cloud technologies like Databricks and Snowflake. 
+ +**Key bootcamp details:** +- Free boot camp available until January 31, 2025 +- Paid January boot camp runs January 6 - February 14 +- Live classes Tuesday-Thursday, 6-8 PM Pacific +- 30% Black Friday discount available through December 2 + +**Technology insights:** +- Avoid technology preferences ("best" tools) +- Focus on business requirements over specific platforms +- Cloud providers (AWS/Azure/GCP) share similar core services +- Orchestration tools (Airflow/Dagster/etc.) serve similar purposes + +**Career guidance:** +- Data engineering requires SQL and Python foundations +- Analytics background provides good business context +- Technical depth matters more than breadth +- Focus on building real projects over certifications + +**Boot camp structure:** +- Homework assignments graded by AI +- Certification requires completing all assignments +- Mentorship available in paid version +- Cloud infrastructure provided for hands-on practice + +**Bottom line:** Success in data engineering requires strong fundamentals, business understanding, and continuous learning rather than focusing on specific tools or certifications. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md b/bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md new file mode 100644 index 000000000..133b5058b --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md @@ -0,0 +1,102 @@ +# Building Gold-Standard Data Pipelines: The Airbnb Midas Process + + +```mermaid +mindmap + root((Building High + Quality Data + Pipelines)) + (Documentation & Trust) + [Good Documentation] + (Spec Review Process) + (Technical Review) + (Stakeholder Review) + (Flow Diagrams) + (Schema Documentation) + (Quality Checks) + (Metric Definitions) + (Example Queries) + [Building Trust] + (Stakeholder Involvement) + (Clear Business Impact) + (Consider Future Needs) + (Validate with Partners) + (Midas Process Steps) + [1. Create Spec] + [2. Spec Review] + [3. Build & Backfill] + [4. SQL Validation] + [5. Manura Validation] + [6. Data Review] + [7. Code Review] + [8. Metric Migration] + [9. Launch PSA] + (Data Quality) + [Basic Checks] + (Not Null) + (No Duplicates) + (Valid Enums) + (Data Exists) + [Intermediate Checks] + (Week over Week Counts) + (Seasonality) + (Row Count Trends) + [Advanced Checks] + (Machine Learning) + (Complex Relationships) + (Seasonality Adjusted) + (Schema Best Practices) + [Naming Conventions] + (fact_ prefix) + (dim_ prefix) + (scd_ prefix) + (agg_ prefix) + [Documentation] + (Column Comments) + (Table Comments) + (Business Context) + [Quality Standards] + (Dimension Checks) + (Fact Checks) + (Relationship Validation) + (Business Value) + [Direct Revenue] + [Cost Savings] + [Strategic Decisions] + [Leading Indicators] + (Metrics) + [Guardrail Metrics] + (Critical Business KPIs) + (Launch Blockers) + [Non-Guardrail Metrics] + (Informational) + (Contextual) +``` + + +**Why it matters**: High-quality data pipelines are crucial for building trust and driving business value. Airbnb's Midas process offers a comprehensive framework for creating reliable, long-lasting data pipelines. + +**The big picture**: The Midas process consists of 9 key steps: +1. Create a spec +2. Get technical and stakeholder reviews +3. Build and backfill pipeline +4. SQL validation +5. Manura validation (metrics) +6. Data review +7. Code review +8. 
Migrate metrics +9. Launch PSA + +**Key insights**: +* Good documentation upfront prevents painful backfills and builds stakeholder trust +* Not every pipeline needs the full Midas treatment - reserve it for critical, long-term data assets +* Strong specs include flow diagrams, schema definitions, quality checks, and example queries +* Different quality checks are needed for dimension vs. fact tables +* Use week-over-week rather than day-over-day comparisons for more reliable monitoring + +**What to watch**: Quality checks should include: +* Basic checks (nulls, duplicates, enum values) +* Intermediate checks (row counts, week-over-week comparisons) +* Advanced checks (seasonality adjustments, machine learning) + +**Bottom line**: While the full Midas process may seem heavy, even implementing a few steps can dramatically improve data quality and stakeholder trust. The upfront investment in documentation and validation pays off in reduced maintenance and stronger analytics partnerships. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md b/bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md new file mode 100644 index 000000000..15afc3caf --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md @@ -0,0 +1,110 @@ +# Advanced Spark Deep Dive: Performance Optimization & API Selection + + +```mermaid +mindmap + root((Advanced Spark)) + Deployment Options + Spark Server + Submit via CLI + JAR deployment + Session ends after job + Better for production + Spark Notebooks + Interactive development + Persistent session + Needs manual termination + Risky for production + APIs + Spark SQL + Lowest barrier to entry + Best for quick iteration + Good for data scientist collaboration + DataFrame API + Modular code + Better testability + Middle ground approach + Dataset API + Schema enforcement + Better null handling + Functional programming + Native language integration + Performance Optimization + Caching + Memory Only + Default caching strategy + Best for reused datasets + Disk Only + Not recommended + Use staging tables instead + When to use + Multiple usage of dataset + Fits in memory + Broadcast Joins + For small datasets + Configurable threshold + No shuffle needed + Entire dataset copied + Bucket Joins + For large datasets + Reduces shuffling + Partitioned data + Better for production + UDFs + Python UDFs + Serialization overhead + Apache Arrow improvements + Good for simple operations + Scala UDFs + Better performance + Native integration + Preferred for complex operations + Best Practices + Memory Management + Executor memory limits + Driver memory configuration + Optimize for workload + Partitioning + Shuffle partitions + 100-200MB per partition + Optimize file size + Consider data skew + Temporary Views + Similar to CTEs + Cache if reused + Manage memory usage +``` + + +**Why it matters:** Understanding Spark's various APIs, caching strategies, and join optimizations is crucial for building efficient data pipelines at scale. + +**Key takeaways:** + +**Spark Server vs. 
Notebooks:** +* Spark Server deployment (via spark-submit) better mirrors production behavior +* Notebooks convenient for development but risky for production without proper CI/CD +* Notebook caching state can mask performance issues + +**Memory & Performance:** +* Caching is only beneficial when data is reused multiple times +* Default executor memory (16GB) often wasteful - tune based on workload +* Use memory-only caching instead of disk caching; prefer staging tables over disk cache +* Broadcast joins effective for small datasets (typically Date: Fri, 24 Jan 2025 16:16:00 -0500 Subject: [PATCH 2/2] chore: Initialize pytest settings and add SQL homework solutions --- .vscode/settings.json | 7 + .../1-dimensional-data-modeling/README.md | 40 +++- .../1-ddl-for-actors-table.sql | 38 ++++ .../2-cumulative-table-generation.sql | 39 ++++ .../3-ddl-for-actors-history.sql | 26 +++ .../4-backfill-actors-history-scd.sql | 49 +++++ .../5-incremental-actors-history-scd.sql | 109 +++++++++++ .../tests/test_1_dimensional_data_modeling.py | 174 ++++++++++++++++++ .../src/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 196 bytes .../jobs/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 201 bytes ...monthly_user_site_hits_job.cpython-310.pyc | Bin 0 -> 1191 bytes .../players_scd_job.cpython-310.pyc | Bin 0 -> 1841 bytes .../team_vertex_job.cpython-310.pyc | Bin 0 -> 1186 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 202 bytes .../conftest.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 600 bytes ...ser_site_hits.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 1363 bytes ...st_player_scd.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 1066 bytes ...am_vertex_job.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 1191 bytes 18 files changed, 481 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py create mode 100644 bootcamp/materials/3-spark-fundamentals/src/__pycache__/__init__.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/__init__.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/monthly_user_site_hits_job.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/players_scd_job.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/team_vertex_job.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/__init__.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/conftest.cpython-310-pytest-8.3.4.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_monthly_user_site_hits.cpython-310-pytest-8.3.4.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_player_scd.cpython-310-pytest-8.3.4.pyc 
create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_team_vertex_job.cpython-310-pytest-8.3.4.pyc diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..5718996c0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "bootcamp" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/README.md b/bootcamp/materials/1-dimensional-data-modeling/README.md index 5a6ec7480..40e28f717 100644 --- a/bootcamp/materials/1-dimensional-data-modeling/README.md +++ b/bootcamp/materials/1-dimensional-data-modeling/README.md @@ -76,7 +76,45 @@ There are two methods to get Postgres running locally. ```bash docker compose up -d ``` - + + ### 🐳 **Option 3: Run Postgres and PGAdmin in Docker Engine on WSL** + +- Install Docker in your WSL distribution + + ```bash + sudo apt update + sudo apt install -y docker.io + ``` + +- Install Docker Compose + + ``` bash + sudo apt update + sudo curl -L "https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + ``` + +- Add user to docker group + + ``` bash + sudo groupadd docker + sudo usermod -aG docker $USER + ``` + +- Start and enable the Docker service + + ```bash + sudo dockerd + ``` + +- Copy **`example.env`** to **`.env`** in a new terminal: + + ```bash + cd bootcamp/materials/1-dimensional-data-modeling + cp example.env .env + docker-compose up -d + ``` + - A folder named **`postgres-data`** will be created in the root of the repo. The data backing your Postgres instance will be saved here. - You can check that your Docker Compose stack is running by either: - Going into Docker Desktop: you should see an entry there with a drop-down for each of the containers running in your Docker Compose stack. 
diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql new file mode 100644 index 000000000..c3e9ed144 --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql @@ -0,0 +1,38 @@ +DO $$ +BEGIN + EXECUTE 'DROP TYPE IF EXISTS quality_class_enum CASCADE'; + EXECUTE 'DROP TYPE IF EXISTS film_struct CASCADE'; + EXECUTE 'CREATE TYPE quality_class_enum AS ENUM (''star'', ''good'', ''average'', ''bad'')'; + EXECUTE 'CREATE TYPE film_struct AS ( + film VARCHAR(255), + votes INTEGER, + rating DECIMAL(3,1), + filmid UUID + )'; +END $$; + +DROP FUNCTION IF EXISTS films_avg_rating CASCADE; +DROP TABLE IF EXISTS actors CASCADE; + +CREATE OR REPLACE FUNCTION films_avg_rating(films film_struct[]) +RETURNS DECIMAL(3,1) AS $$ +BEGIN + RETURN COALESCE((SELECT AVG(f.rating)::DECIMAL(3,1) FROM unnest(films) AS f), 0); +END; +$$ LANGUAGE plpgsql; + +CREATE TABLE actors ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(255) NOT NULL, + films film_struct[] NOT NULL, + quality_class quality_class_enum NOT NULL, + is_active BOOLEAN NOT NULL DEFAULT false, + CONSTRAINT rating_check CHECK ( + quality_class = CASE + WHEN films_avg_rating(films) > 8 THEN 'star'::quality_class_enum + WHEN films_avg_rating(films) > 7 THEN 'good'::quality_class_enum + WHEN films_avg_rating(films) > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END + ) +); \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql new file mode 100644 index 000000000..2f3dfd2de --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql @@ -0,0 +1,39 @@ +TRUNCATE actors; + +WITH RECURSIVE years_desc AS ( + SELECT MAX(year) as year + FROM actor_films + UNION ALL + SELECT year - 1 + FROM years_desc + WHERE year > (SELECT MIN(year) FROM actor_films) +), +yearly_films AS ( + SELECT + actorId, + actor, + year, + ARRAY_AGG( + ROW(film, votes, rating, gen_random_uuid())::film_struct + ORDER BY rating DESC + ) as films, + AVG(rating)::DECIMAL(3,1) as avg_rating, + ROW_NUMBER() OVER (PARTITION BY actorId ORDER BY year DESC) as row_num + FROM actor_films + GROUP BY actorId, actor, year +) +INSERT INTO actors (id, name, films, quality_class, is_active) +SELECT + gen_random_uuid(), + y.actor, + y.films, + (CASE + WHEN y.avg_rating > 8 THEN 'star'::quality_class_enum + WHEN y.avg_rating > 7 THEN 'good'::quality_class_enum + WHEN y.avg_rating > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END), + row_num = 1 +FROM yearly_films y +JOIN years_desc yd ON y.year = yd.year +ORDER BY yd.year DESC; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql new file mode 100644 index 000000000..402e76763 --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql @@ -0,0 +1,26 @@ +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'actors_history_scd' + ) THEN + CREATE TABLE actors_history_scd ( + id UUID PRIMARY KEY DEFAULT 
gen_random_uuid(), + actor_id UUID NOT NULL REFERENCES actors(id), + quality_class quality_class_enum NOT NULL, + is_active BOOLEAN NOT NULL, + start_date TIMESTAMP WITH TIME ZONE NOT NULL, + end_date TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT valid_dates CHECK (start_date <= COALESCE(end_date, 'infinity'::TIMESTAMP)) + ); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'actors_history_scd' + AND indexname = 'idx_actors_history_dates' + ) THEN + CREATE INDEX idx_actors_history_dates ON actors_history_scd (actor_id, start_date, end_date); + END IF; +END $$; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql new file mode 100644 index 000000000..d723e65fd --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql @@ -0,0 +1,49 @@ +TRUNCATE actors_history_scd; + +WITH actor_class_changes AS ( + SELECT + actorId, + actor, + year, + AVG(rating)::DECIMAL(3,1) as avg_rating, + CASE + WHEN AVG(rating) > 8 THEN 'star'::quality_class_enum + WHEN AVG(rating) > 7 THEN 'good'::quality_class_enum + WHEN AVG(rating) > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END as curr_quality_class, + year = (SELECT MAX(year) FROM actor_films) as curr_is_active, + ROW_NUMBER() OVER (PARTITION BY actorId ORDER BY year) as version_num, + LAG(CASE + WHEN AVG(rating) > 8 THEN 'star'::quality_class_enum + WHEN AVG(rating) > 7 THEN 'good'::quality_class_enum + WHEN AVG(rating) > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END) OVER (PARTITION BY actorId ORDER BY year) as prev_quality_class, + LAG(year = (SELECT MAX(year) FROM actor_films)) OVER (PARTITION BY actorId ORDER BY year) as prev_is_active + FROM actor_films + GROUP BY actorId, actor, year +) +INSERT INTO actors_history_scd ( + actor_id, + quality_class, + is_active, + start_date, + end_date +) +SELECT + a.id as actor_id, + c.curr_quality_class, + c.curr_is_active, + make_timestamp(c.year, 1, 1, 0, 0, 0)::timestamptz as start_date, + CASE + WHEN LEAD(c.year) OVER (PARTITION BY c.actorId ORDER BY c.year) IS NOT NULL + THEN make_timestamp(LEAD(c.year) OVER (PARTITION BY c.actorId ORDER BY c.year), 1, 1, 0, 0, 0)::timestamptz + ELSE NULL + END as end_date +FROM actor_class_changes c +JOIN actors a ON a.name = c.actor +WHERE version_num = 1 + OR curr_quality_class != prev_quality_class + OR curr_is_active != prev_is_active +ORDER BY c.actorId, c.year; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql new file mode 100644 index 000000000..6ac72a8d7 --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql @@ -0,0 +1,109 @@ +CREATE TYPE actor_scd_type AS ( + quality_class quality_class_enum, + is_active boolean, + start_year INTEGER, + end_year INTEGER +); + +WITH latest_scd AS ( + SELECT * FROM actors_history_scd + WHERE EXTRACT(YEAR FROM end_date) = EXTRACT(YEAR FROM CURRENT_DATE) - 1 + OR end_date IS NULL +), +historical_scd AS ( + SELECT + actor_id, + quality_class, + is_active, + EXTRACT(YEAR FROM start_date)::INTEGER as 
start_year, + EXTRACT(YEAR FROM end_date)::INTEGER as end_year + FROM actors_history_scd + WHERE EXTRACT(YEAR FROM end_date) < EXTRACT(YEAR FROM CURRENT_DATE) - 1 +), +current_actors AS ( + SELECT + id as actor_id, + quality_class, + is_active, + EXTRACT(YEAR FROM CURRENT_DATE)::INTEGER as curr_year + FROM actors +), +unchanged_records AS ( + SELECT + ca.actor_id, + ca.quality_class, + ca.is_active, + EXTRACT(YEAR FROM ls.start_date)::INTEGER as start_year, + ca.curr_year as end_year + FROM current_actors ca + JOIN latest_scd ls ON ls.actor_id = ca.actor_id + WHERE ca.quality_class = ls.quality_class + AND ca.is_active = ls.is_active +), +changed_records AS ( + SELECT + ca.actor_id, + UNNEST(ARRAY[ + ROW( + ls.quality_class, + ls.is_active, + EXTRACT(YEAR FROM ls.start_date)::INTEGER, + EXTRACT(YEAR FROM ls.end_date)::INTEGER + )::actor_scd_type, + ROW( + ca.quality_class, + ca.is_active, + ca.curr_year, + ca.curr_year + )::actor_scd_type + ]) as records + FROM current_actors ca + LEFT JOIN latest_scd ls ON ls.actor_id = ca.actor_id + WHERE ca.quality_class != ls.quality_class + OR ca.is_active != ls.is_active +), +unnested_changed_records AS ( + SELECT + actor_id, + (records::actor_scd_type).quality_class, + (records::actor_scd_type).is_active, + (records::actor_scd_type).start_year, + (records::actor_scd_type).end_year + FROM changed_records +), +new_records AS ( + SELECT + ca.actor_id, + ca.quality_class, + ca.is_active, + ca.curr_year as start_year, + ca.curr_year as end_year + FROM current_actors ca + LEFT JOIN latest_scd ls ON ca.actor_id = ls.actor_id + WHERE ls.actor_id IS NULL +) +INSERT INTO actors_history_scd ( + actor_id, + quality_class, + is_active, + start_date, + end_date +) +SELECT + actor_id, + quality_class, + is_active, + make_timestamp(start_year, 1, 1, 0, 0, 0)::timestamptz as start_date, + CASE + WHEN end_year IS NOT NULL THEN make_timestamp(end_year, 12, 31, 23, 59, 59)::timestamptz + ELSE NULL + END as end_date +FROM ( + SELECT * FROM historical_scd + UNION ALL + SELECT * FROM unchanged_records + UNION ALL + SELECT * FROM unnested_changed_records + UNION ALL + SELECT * FROM new_records +) combined; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py new file mode 100644 index 000000000..166f1d07c --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py @@ -0,0 +1,174 @@ +import pytest +from sqlalchemy import create_engine, text +from datetime import date +import json + +@pytest.fixture(scope="session") +def db_engine(): + return create_engine('postgresql://postgres:postgres@localhost:5432/postgres') + +def test_1_validate_actors_ddl(): + expected_ddl = """ + CREATE TYPE film_struct AS ( + film VARCHAR(255), + votes INTEGER, + rating DECIMAL(3,1), + filmid UUID + ); + + CREATE TYPE quality_class_enum AS ENUM ('star', 'good', 'average', 'bad'); + + CREATE TABLE actors ( + id UUID PRIMARY KEY, + name VARCHAR(255) NOT NULL, + films film_struct[] NOT NULL, + quality_class quality_class_enum, + is_active BOOLEAN NOT NULL + ) + """ + assert expected_ddl.strip() == "YOUR_DDL_HERE".strip() + + +def test_2_cumulative_table_generation(db_engine): + with db_engine.connect() as conn: + test_data = [ + (1, 'Actor1', 2022, json.dumps([ + {"film": "Film1", "votes": 1000, "rating": 8.5, "filmid": "f1", "year": 
2022}, + {"film": "Film2", "votes": 1000, "rating": 7.5, "filmid": "f2", "year": 2021} + ])), + (2, 'Actor2', 2022, json.dumps([ + {"film": "Film3", "votes": 1000, "rating": 6.5, "filmid": "f3", "year": 2022}, + {"film": "Film4", "votes": 1000, "rating": 5.5, "filmid": "f4", "year": 2021} + ])) + ] + + for id, name, year, films in test_data: + conn.execute( + text("INSERT INTO actors (id, name, created_year, films) VALUES (:id, :name, :year, :films::jsonb)"), + {"id": id, "name": name, "year": year, "films": films} + ) + conn.commit() + + cumulative_query = """ + WITH yearly_stats AS ( + SELECT + a.id, + film_data->>'year' as year, + AVG((film_data->>'rating')::float) as avg_rating, + bool_or((film_data->>'year')::int = EXTRACT(YEAR FROM CURRENT_DATE)) as is_active + FROM actors a, + jsonb_array_elements(films) as film_data + WHERE (film_data->>'year')::int <= :target_year + GROUP BY a.id, film_data->>'year' + ) + UPDATE actors a + SET quality_class = + CASE + WHEN ys.avg_rating > 8 THEN 'star' + WHEN ys.avg_rating > 7 THEN 'good' + WHEN ys.avg_rating > 6 THEN 'average' + ELSE 'bad' + END, + is_active = ys.is_active + FROM yearly_stats ys + WHERE a.id = ys.id + AND ys.year = :target_year + """ + + conn.execute(text(cumulative_query), {"target_year": 2022}) + conn.commit() + + result = conn.execute(text("SELECT id, quality_class, is_active FROM actors")) + actor_stats = {row[0]: (row[1], row[2]) for row in result} + + assert actor_stats[1][0] == 'star' + assert actor_stats[1][1] is True + assert actor_stats[2][0] == 'average' + assert actor_stats[2][1] is True + + +def test_3_validate_scd_ddl(): + expected_ddl = """ + CREATE TABLE actors_history_scd ( + actor_id BIGINT REFERENCES actors(id), + quality_class VARCHAR(10) CHECK (quality_class IN ('star', 'good', 'average', 'bad')), + is_active BOOLEAN NOT NULL, + start_date DATE NOT NULL, + end_date DATE, + is_current BOOLEAN NOT NULL, + CONSTRAINT valid_dates CHECK (end_date IS NULL OR end_date >= start_date) + ) + """ + assert expected_ddl.strip() == "YOUR_SCD_DDL_HERE".strip() + +def test_4_scd_backfill(db_engine): + with db_engine.connect() as conn: + backfill_query = """ + INSERT INTO actors_history_scd ( + actor_id, quality_class, is_active, start_date, end_date, is_current + ) + SELECT DISTINCT ON (a.id) + a.id, + a.quality_class, + a.is_active, + make_date(a.created_year, 1, 1) as start_date, + NULL as end_date, + TRUE as is_current + FROM actors a + ORDER BY a.id, a.created_year DESC + """ + + conn.execute(text(backfill_query)) + conn.commit() + + result = conn.execute(text("SELECT COUNT(*) FROM actors_history_scd")) + assert result.scalar() == 2 + +def test_5_scd_incremental_update(db_engine): + with db_engine.connect() as conn: + incremental_query = """ + WITH updates AS ( + SELECT + a.id as actor_id, + a.quality_class, + a.is_active, + CURRENT_DATE as start_date + FROM actors a + JOIN actors_history_scd h + ON a.id = h.actor_id + AND h.is_current = TRUE + WHERE h.quality_class != a.quality_class + OR h.is_active != a.is_active + ) + INSERT INTO actors_history_scd ( + actor_id, quality_class, is_active, start_date, end_date, is_current + ) + SELECT + u.actor_id, + u.quality_class, + u.is_active, + u.start_date, + NULL, + TRUE + FROM updates u + """ + + # Update test data + conn.execute(text(""" + UPDATE actors + SET quality_class = 'good', is_active = false + WHERE id = 1 + """)) + + conn.execute(text(incremental_query)) + conn.commit() + + result = conn.execute(text(""" + SELECT COUNT(*) + FROM actors_history_scd + WHERE actor_id = 
diff --git a/bootcamp/materials/3-spark-fundamentals/src/__pycache__/__init__.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..312dfe4ffa5bd8977c9e6fe5055cb959f29ed45f
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 196 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/__init__.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3048d13ec785f2d26e8619b8ee5c0dda26d01d5f
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 201 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/monthly_user_site_hits_job.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/monthly_user_site_hits_job.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24a6e6a1b9b1ca38271f840798080ebb88500648
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 1191 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/team_vertex_job.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/team_vertex_job.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdd3e52d03f3501108047cecb18eabd69ac843ac
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 1186 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/conftest.cpython-310-pytest-8.3.4.pyc b/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/conftest.cpython-310-pytest-8.3.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..046790b76730c36e243eef22acb14fcd0a5c0ab3
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 600 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_monthly_user_site_hits.cpython-310-pytest-8.3.4.pyc b/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_monthly_user_site_hits.cpython-310-pytest-8.3.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..feae115187e05449f8f89cf39578ebc47424516c
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 1363 bytes]