From c01e91e9d8d67609309366e6cd1d1f6e9e8e614b Mon Sep 17 00:00:00 2001 From: John Bowyer Date: Fri, 24 Jan 2025 12:35:00 -0500 Subject: [PATCH 1/2] docs: Add smartbrevity overview materials for Data Engineering Boot Camp summaries. --- ...ng_boot_camp_kick_off_and_informational.md | 100 +++++++++++++++ ...eling_complex_data_types_and_cumulation.md | 113 +++++++++++++++++ ..._cumulative_dimensions_struct_and_array.md | 85 +++++++++++++ ...wly_changing_dimensions_and_idempotency.md | 94 ++++++++++++++ ...uilding_slowly_changing_dimensions_scds.md | 93 ++++++++++++++ ...ing_graph_databases_additive_dimensions.md | 98 ++++++++++++++ .../week_1_w_free_boot_camp_live_stream.md | 115 +++++++++++++++++ ...2_how_meta_models_big_volume_event_data.md | 120 ++++++++++++++++++ ...ek_2_w_free_boot_camp_live_black_friday.md | 96 ++++++++++++++ ...gold_pipeline_like_airbnb_midas_process.md | 102 +++++++++++++++ ...ce_spark_dataframe_dataset_udfs_caching.md | 110 ++++++++++++++++ .../week_3_master_data_contracts.md | 111 ++++++++++++++++ ...k_iceberg_memory_tuning_joins_partition.md | 95 ++++++++++++++ ...eek_3_testing_apache_spark_jobs_in_cicd.md | 100 +++++++++++++++ .../week_3_w3_free_boot_camp_live_stream.md | 81 ++++++++++++ ...sign_patterns_at_meta_growth_accounting.md | 88 +++++++++++++ .../week_4_w4_free_boot_camp_live.md | 92 ++++++++++++++ ...adriven_business_value_with_tableau_viz.md | 114 +++++++++++++++++ ..._data_pipelines_like_netflix_and_airbnb.md | 46 +++++++ ...ew_year_and_free_boot_camp_wrap_up_live.md | 39 ++++++ ...tone_projects_for_analytics_engineering.md | 107 ++++++++++++++++ ...ime_data_pipelines_with_kafka_and_flink.md | 113 +++++++++++++++++ 22 files changed, 2112 insertions(+) create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_master_data_contracts.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_spark_iceberg_memory_tuning_joins_partition.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_testing_apache_spark_jobs_in_cicd.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_3_w3_free_boot_camp_live_stream.md create mode 100644 
bootcamp/materials/0-smart-brevity-overview/week_4_data_engineer_design_patterns_at_meta_growth_accounting.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_4_w4_free_boot_camp_live.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_6_build_datadriven_business_value_with_tableau_viz.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_6_maintain_data_pipelines_like_netflix_and_airbnb.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_6_new_year_and_free_boot_camp_wrap_up_live.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_end_additional_jobready_capstone_projects_for_analytics_engineering.md create mode 100644 bootcamp/materials/0-smart-brevity-overview/week_end_additional_master_realtime_data_pipelines_with_kafka_and_flink.md diff --git a/bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md b/bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md new file mode 100644 index 000000000..6a3d37394 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_0_6week_free_data_engineering_boot_camp_kick_off_and_informational.md @@ -0,0 +1,100 @@ +# Free Data Engineering Boot Camp Kickoff Summary + +```mermaid +mindmap + root((Free Data Engineering Boot Camp)) + Program Structure [00:01:01] + Six weeks intensive + 1-2 hours daily commitment + Pre-recorded lessons + Two components per module + Lecture + Lab + AI-graded homework + Discord community support + Curriculum [00:06:02] + Data Modeling + Two weeks coverage + Foundation concepts + Data product focus + Analytical Patterns + Growth accounting + Advanced SQL + Window functions + KPIs & Experimentation + Metrics definition + Product thinking + Data Visualization + Communication + Tableau basics + Dashboard types + Infrastructure Track + Unit testing + Pipeline maintenance + Apache Spark fundamentals + Data quality patterns + Real-time pipelines + Certification Paths [00:19:40] + Watch-only certificate + Attendance tracking + Basic recognition + Full certification + Complete all homework + Watch all content + Expected 3-4% completion rate + Paid vs Free Differences [00:27:28] + Cloud infrastructure access + AWS deployment + One year access + Paid APIs available + Enhanced support + Weekly Q&A sessions + Industry expert speakers + Dedicated TA support + Additional content + Snowflake + Trino + DBT + Apache Iceberg + Capstone project + Dedicated feedback + Portfolio building + Job interview training +``` + + +*A comprehensive 6-week program launching online with daily content releases at 5 PM Pacific.* + +**Big picture:** Tech expert Zach is offering free data engineering training to help 1,000 engineers land jobs by February 15, with content available on YouTube until December 2025. 
+ +**Key details:** +- 10,000+ enrolled students +- 1-2 hours daily commitment recommended +- All content pre-recorded and uploaded daily +- Includes AI-graded homework assignments +- Discord community support available + +**Core curriculum:** +- Data modeling (2 weeks) +- Analytical patterns and advanced SQL +- KPIs and experimentation +- Data visualization +- Infrastructure and pipeline maintenance +- Apache Spark fundamentals +- Real-time pipelines with Flink and Kafka + +**Success metrics:** Only about 300-400 out of 10,000 students expected to complete certification, requiring: +- Watching all videos +- Completing all homework assignments +- Active participation in community + +**What's different in paid version:** +- Cloud infrastructure access +- Weekly Q&As with Zach +- Industry expert speakers +- Additional tools: Snowflake, DBT, AWS Glue +- Direct TA support +- Capstone project mentorship +- Job interview training + +**Bottom line:** While free version offers substantial technical training, paid version ($) provides more hands-on support and cloud-based tools for job preparation. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md new file mode 100644 index 000000000..ded341c70 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_intro_data_modeling_complex_data_types_and_cumulation.md @@ -0,0 +1,113 @@ +# Data Modeling: Complex Types and Cumulative Tables Deep Dive + +*A comprehensive look at dimensional data modeling principles, focusing on the balance between data efficiency and usability.* + + +```mermaid +mindmap + root((Dimensional + Data + Modeling - Intro)) + (Understanding Dimensions) + (Identifier Dimensions) + (Uniquely identify entities) + (User ID) + (Social Security) + (Device ID) + (Attributes) + (Slowly Changing) + (Time dependent values) + (Can change over time) + (Fixed) + (Birthday) + (Phone manufacturer) + (Data Modeling Types) + (OLTP) + (Online Transaction Processing) + (Normalized) + (Minimal duplication) + (Fast single row operations) + (Master Data) + (Middle ground) + (Complete entity definitions) + (Reference data) + (OLAP) + (Online Analytical Processing) + (Denormalized) + (Optimized for analysis) + (Population level queries) + (Cumulative Table Design) + (Historical Analysis) + (State Transitions) + (Uses Today + Yesterday data) + (Full outer join approach) + (Drawbacks) + (Sequential backfilling) + (PII management challenges) + (Complex Data Types) + (Struct) + (Table within table) + (Different value types) + (Array) + (Ordered lists) + (Same data type) + (Map) + (Key-value pairs) + (Same value type) + (Data Consumer Types) + (Data Analysts) + (Need simple flat data) + (Easy to query) + (Data Engineers) + (Can handle complex types) + (Build downstream pipelines) + (ML Models) + (Need identifier + features) + (Flat structure preferred) + (Customers) + (Need visualizations) + (Charts over raw data) + (Compactness vs Usability) + (Most Compact) + (Compressed data) + (Online systems) + (Middle Ground) + (Arrays and structs) + (Master data) + (Most Usable) + (Flat structure) + (Analytics focused) +``` + + +**Big picture:** Data modeling strategies vary significantly based on end users' needs, from analysts requiring simple flat tables to engineers working with compressed complex data types. 
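+
+**Illustrative sketch:** The struct/array/map tradeoffs listed below are easier to see against a concrete schema. This is a rough PostgreSQL sketch with hypothetical names (`season_stat`, `player_dim`), not a table from the course; a `JSONB` column stands in for the "map" type, since PostgreSQL has no native map.
+
+```sql
+-- "Struct": a named composite type, i.e. a table within a table.
+CREATE TYPE season_stat AS (
+    season INTEGER,
+    games  INTEGER,
+    points REAL
+);
+
+CREATE TABLE player_dim (
+    player_name  TEXT PRIMARY KEY,
+    season_stats season_stat[],  -- "Array": ordered elements of a single type
+    attributes   JSONB           -- "Map"-like: flexible key-value pairs
+);
+```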
+ +**Key dimensions types:** +- Identifier dimensions (unique entity IDs) +- Slowly changing dimensions (values change over time) +- Fixed dimensions (unchangeable values) + +**Data modeling layers:** +- OLTP (transactional): Optimized for single-record operations +- Master data: Middle ground, combines completeness with efficiency +- OLAP (analytical): Optimized for aggregation and analysis +- Metrics: Highest level of aggregation + +**Cumulative table design benefits:** +- Maintains complete historical records +- Enables efficient historical analysis +- Supports state transition tracking +- Reduces query complexity + +**Watch out for:** +- Sequential processing requirement limits parallel backfilling +- Privacy concerns with historical data retention +- Storage size growth over time +- Shuffle operations breaking data sorting in distributed systems + +**Complex data types tradeoffs:** +- Arrays: Best for ordered data, same-type elements +- Structs: Flexible "table within table" approach +- Maps: Dynamic key-value pairs with type restrictions + +**Bottom line:** Success in dimensional modeling requires understanding your data consumers and balancing compression efficiency with query usability. Master data serves as a critical middle ground between transactional and analytical needs. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md new file mode 100644 index 000000000..982044c0d --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d1_lab_data_modeling_cumulative_dimensions_struct_and_array.md @@ -0,0 +1,85 @@ +# Building Cumulative Tables with Complex Data Types: Lab Tutorial + +*A hands-on demonstration of creating efficient dimensional tables using PostgreSQL arrays and structs to track NBA player statistics over time.* + + +```mermaid +mindmap + root((Dimensional Data Modeling + Lab)) + (Data Structure) + [Player Seasons Table] + (Temporal Components) + (Player Attributes) + (Name) + (Height) + (College) + (Country) + (Draft Info) + (Season Stats) + (Games Played) + (Points) + (Rebounds) + (Assists) + (Data Types) + [Custom Types] + (season_stats struct) + (scoring_class enum) + (Table Design) + [Players Table] + (Primary Key) + (player_name) + (current_season) + (Non-temporal Columns) + (season_stats Array) + (Additional Metrics) + (years_since_last_season) + (scoring_class) + (Cumulative Pattern) + [Benefits] + (Maintains Data History) + (Efficient Joins) + (No Shuffling Required) + (Fast Analytics) + [Implementation] + (Yesterday Query) + (Today Query) + (Full Outer Join) + (Array Concatenation) + (Analytics Capabilities) + [Historical Analysis] + (Player Progress) + (Career Gaps) + (Performance Metrics) + [Data Transformations] + (Unnest Operations) + (Array Manipulations) +``` + + +**Big picture:** Converting season-by-season player statistics into a cumulative table using complex data types reduces data duplication and maintains data sorting efficiency while enabling quick historical analysis. 
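+
+**Illustrative sketch:** A minimal version of the cumulative pattern detailed in the components and steps below, assuming simplified names and a yearly grain; the lab's actual tables carry more columns.
+
+```sql
+CREATE TYPE season_stat AS (season INTEGER, gp INTEGER, pts REAL);
+
+CREATE TABLE player_seasons (player_name TEXT, season INTEGER, gp INTEGER, pts REAL);
+
+CREATE TABLE players (
+    player_name    TEXT,
+    season_stats   season_stat[],  -- one array element per season played
+    current_season INTEGER,
+    PRIMARY KEY (player_name, current_season)
+);
+
+-- Each run full-outer-joins "yesterday" (the cumulated table) with "today"
+-- (one new season of data) and appends the new season onto the array.
+INSERT INTO players
+WITH yesterday AS (
+    SELECT * FROM players WHERE current_season = 1995
+), today AS (
+    SELECT * FROM player_seasons WHERE season = 1996
+)
+SELECT
+    COALESCE(t.player_name, y.player_name) AS player_name,
+    COALESCE(y.season_stats, ARRAY[]::season_stat[])
+        || CASE WHEN t.season IS NOT NULL
+                THEN ARRAY[ROW(t.season, t.gp, t.pts)::season_stat]
+                ELSE ARRAY[]::season_stat[]
+           END AS season_stats,
+    COALESCE(t.season, y.current_season + 1) AS current_season
+FROM today t
+FULL OUTER JOIN yesterday y ON t.player_name = y.player_name;
+```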
+ +**Key components:** +- Custom struct type for season statistics +- Array column to store multiple seasons +- Tracking columns for scoring class and years since last season +- Full outer join logic for cumulative updates + +**Implementation steps:** +- Create custom struct type for season stats (points, games, rebounds, assists) +- Build base table with player attributes and season stats array +- Implement incremental loading logic using full outer joins +- Add derived columns for player classification and activity tracking + +**Performance benefits:** +- No GROUP BY operations needed for historical analysis +- Maintains data sorting after joins +- Reduces storage through elimination of duplicated data +- Enables fast parallel processing + +**Real-world example:** +- Tracked Michael Jordan's career gap (1996-1997, returned 2001) +- Demonstrated scoring progression from first to last season +- Identified most improved players without expensive aggregations + +**Bottom line:** Complex data types with cumulative loading provide significant performance advantages for dimensional data that changes over time, while maintaining data usability through unnesting capabilities. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md new file mode 100644 index 000000000..a5606e912 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_intro_data_modeling_slowly_changing_dimensions_and_idempotency.md @@ -0,0 +1,94 @@ +# Idempotency and Slowly Changing Dimensions in Data Engineering + +*A comprehensive guide to building reliable data pipelines and handling temporal dimension changes.* + + +```mermaid +mindmap + root((Data Modeling & + SCDs Intro)) + )Idempotent Pipelines( + (Problems to Avoid) + Insert without truncate + Missing end date in queries + Incomplete partition sensors + Depends on past issues + Latest partition dependency + (Definition) + Same results regardless of + When ran + How many times ran + Time of day + (Benefits) + Reproducible results + Easier troubleshooting + Better unit testing + Fewer silent failures + )Slowly Changing Dimensions( + (Definition) + Attributes that change over time + Examples + Age + Phone preferences + Country + Food preferences + (Types) + Type 0 + Static dimensions + Never change + Type 1 + Only latest value + Not recommended for analytics + Type 2 + Start and end dates + Complete history + Airbnb gold standard + Type 3 + Original and current value + Limited history + (Implementation Methods) + Full history processing + Incremental loading + (Modeling Approaches) + Latest snapshot + Not recommended + Non-idempotent + Daily snapshots + Simple but storage heavy + Max's preferred approach + SCD modeling + Compressed history + Type 2 recommended +``` + + +**Big picture:** Data pipelines must produce consistent results regardless of when or how many times they run. Slowly changing dimensions (SCDs) need careful modeling to maintain data accuracy over time. 
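+
+**Illustrative sketch:** A hypothetical shape for an idempotent daily load (table names are invented for this example): the target day is cleared before inserting and the input window is fully bounded, so production runs, re-runs, and backfills all land the same result, avoiding the pitfalls listed below.
+
+```sql
+CREATE TABLE IF NOT EXISTS orders (
+    order_id    BIGINT,
+    customer_id BIGINT,
+    order_date  DATE
+);
+
+CREATE TABLE IF NOT EXISTS daily_orders_agg (
+    order_date  DATE,
+    customer_id BIGINT,
+    order_count BIGINT,
+    PRIMARY KEY (order_date, customer_id)
+);
+
+BEGIN;
+
+-- Clear the partition being (re)built instead of blindly appending to it.
+DELETE FROM daily_orders_agg WHERE order_date = DATE '2024-01-15';
+
+INSERT INTO daily_orders_agg (order_date, customer_id, order_count)
+SELECT order_date, customer_id, COUNT(*) AS order_count
+FROM orders
+WHERE order_date >= DATE '2024-01-15'   -- bounded window: exactly one day,
+  AND order_date <  DATE '2024-01-16'   -- never "everything up to now"
+GROUP BY order_date, customer_id;
+
+COMMIT;
+```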
+ +**Idempotency essentials:** +- Pipelines should produce identical results whether run in production or backfill +- Multiple runs should not create duplicates +- Time windows must have clear boundaries +- All input dependencies must be verified + +**Common idempotency pitfalls:** +- Using INSERT without TRUNCATE +- Unbounded date ranges +- Missing partition sensors +- Relying on latest data instead of point-in-time +- Cumulative dependencies on non-idempotent sources + +**SCD modeling approaches:** +- Type 0: Fixed dimensions that never change +- Type 1: Latest value only (avoid for analytics) +- Type 2: Full history with start/end dates (recommended) +- Type 3: Original and current values only + +**Best practices for Type 2 SCDs:** +- Use start and end dates for each value period +- Mark current records with future end date +- Can be loaded via full rebuild or incremental updates +- Choose implementation based on data volume and business needs + +**Bottom line:** Idempotent pipelines and Type 2 SCDs are critical for reliable analytics, though implementation complexity should be balanced against business value and maintenance costs. + +**Watch out for:** Using latest snapshot dimensions in analytical pipelines - this breaks idempotency and can cause incorrect historical analysis. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md new file mode 100644 index 000000000..0b50a05ec --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d2_lab_data_modeling_building_slowly_changing_dimensions_scds.md @@ -0,0 +1,93 @@ +# Building Slowly Changing Dimensions (SCD) in Data Modeling + +*A technical deep dive into implementing SCD Type 2 for tracking historical changes in dimensional data using PostgreSQL.* + +```mermaid +mindmap + root((Data Modeling & + SCDs Intro Lab)) + Table Structure + Primary Key + Player Name + Start Season + Tracking Columns + Scoring Class + Is Active + Time Columns + Start Season + End Season + Current Season + Full History Approach + Window Functions + LAG for Previous Values + Partition by Player Name + Order by Season + Change Detection + Compare Current vs Previous + Generate Change Indicators + Streak Identification + Sum Change Indicators + Group Records by Streaks + Advantages + Simple to Understand + Good for Smaller Dimensions + Disadvantages + Memory Intensive + Processes All History + Multiple Window Functions + Incremental Approach + Components + Historical SCD + Last Season SCD + This Season Data + Record Types + Unchanged Records + Extend End Date + Changed Records + Close Old Record + Create New Record + New Records + Create First Record + Advantages + Less Data Processing + More Efficient + Better for Large Dimensions + Disadvantages + More Complex Logic + Sequential Dependencies + Harder to Backfill + Best Practices + Handle Null Values + Check Data Quality + Consider Data Volume + Document Assumptions + Test Edge Cases +``` + +**Big picture:** Two approaches demonstrated for tracking dimensional changes over time - a full historical rebuild and an incremental update method. 
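+
+**Illustrative sketch:** A rough version of the full-rebuild approach described below, run against a hypothetical yearly snapshot table; the lab's real queries track more attributes and load the result into an SCD table.
+
+```sql
+CREATE TABLE IF NOT EXISTS player_snapshots (
+    player_name   TEXT,
+    season        INTEGER,
+    scoring_class TEXT,
+    is_active     BOOLEAN
+);
+
+-- Compare each season to the previous one, flag changes, turn the running
+-- sum of flags into a streak id, then collapse each streak into one SCD row.
+WITH with_previous AS (
+    SELECT
+        player_name, season, scoring_class, is_active,
+        LAG(scoring_class) OVER w AS prev_scoring_class,
+        LAG(is_active)     OVER w AS prev_is_active
+    FROM player_snapshots
+    WINDOW w AS (PARTITION BY player_name ORDER BY season)
+), with_change_flag AS (
+    SELECT *,
+        CASE WHEN scoring_class IS DISTINCT FROM prev_scoring_class
+               OR is_active     IS DISTINCT FROM prev_is_active
+             THEN 1 ELSE 0 END AS changed
+    FROM with_previous
+), with_streak AS (
+    SELECT *,
+        SUM(changed) OVER (PARTITION BY player_name ORDER BY season) AS streak_id
+    FROM with_change_flag
+)
+SELECT
+    player_name,
+    scoring_class,
+    is_active,
+    MIN(season) AS start_season,
+    MAX(season) AS end_season
+FROM with_streak
+GROUP BY player_name, streak_id, scoring_class, is_active
+ORDER BY player_name, start_season;
+```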
+ +**Key components of SCD Type 2:** +- Start and end dates for each dimension record +- Support for tracking multiple changing attributes +- Maintains complete historical record of changes +- Primary key based on entity name and start date + +**Full historical rebuild approach:** +- Uses window functions to detect changes +- Generates streak identifiers for tracking changes +- More memory-intensive but simpler to implement +- Works well for smaller dimensional tables (millions of records) +- Requires scanning all historical data + +**Incremental update method:** +- Processes only changed records and new data +- More complex query logic but better performance +- Handles three scenarios: + - Unchanged records (extend end date) + - Changed records (close old record, create new) + - New records (create initial record) +- Better for larger datasets but requires sequential processing + +**Bottom line:** Choice between approaches depends on data volume and processing requirements. Full rebuild is simpler but less efficient; incremental update is more complex but better performing at scale. + +**Watch out for:** Null handling in dimensional attributes can break comparison logic. Always validate assumptions about data quality when implementing either approach. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md b/bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md new file mode 100644 index 000000000..e2c9994d1 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_d3_intro_data_modeling_graph_databases_additive_dimensions.md @@ -0,0 +1,98 @@ +# Graph Data Modeling and Advanced Dimensional Concepts + +*A deep dive into graph databases, additive dimensions, and flexible schema patterns for complex data modeling scenarios.* + + +```mermaid +mindmap + root((Dimensional Data + Modeling Graph Intro)) + (Graph Data Modeling) + (Focus on Relationships) + (Less Entity Focused) + (Connection-Centric) + (Schema Structure) + (Flexible Schema) + (Basic Components) + (Vertices) + (ID) + (Type) + (Properties) + (Edges) + (Subject ID & Type) + (Object ID & Type) + (Edge Type) + (Properties) + (Additive vs Non-Additive Dimensions) + (Additivity Concept) + (Time Window Dependency) + (Single Value Rule) + (Examples) + (Honda Cars vs Drivers) + (Age Groups) + (Device Users) + (Impact on Analytics) + (Count Metrics) + (Ratio Metrics) + (Aggregation Methods) + (Enums Power) + (Benefits) + (Built-in Data Quality) + (Static Fields) + (Documentation) + (Usage Guidelines) + (Less than 50 Values) + (Partition Values) + (Little Book of Pipelines) + (Source Functions) + (Shared Schema) + (Data Quality Checks) + (Partitioned Output) + (Flexible Data Types) + (Map Benefits) + (Schema Evolution) + (Dynamic Properties) + (Other Properties Column) + (Drawbacks) + (Poor Compression) + (Storage Overhead) +``` + + +**Big picture:** Graph data modeling shifts focus from entities to relationships, using a standardized structure to track how different objects connect and interact. 
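+
+**Illustrative sketch:** The generic vertex/edge layout described below, written as two PostgreSQL tables. The enum values and the use of `JSONB` as the properties "map" are assumptions for illustration, not the course's exact schema.
+
+```sql
+CREATE TYPE vertex_type AS ENUM ('player', 'team', 'game');
+CREATE TYPE edge_type   AS ENUM ('plays_on', 'plays_against', 'plays_in');
+
+CREATE TABLE vertices (
+    identifier TEXT,
+    type       vertex_type,
+    properties JSONB,                       -- flexible key-value attributes
+    PRIMARY KEY (identifier, type)
+);
+
+CREATE TABLE edges (
+    subject_identifier TEXT,
+    subject_type       vertex_type,
+    object_identifier  TEXT,
+    object_type        vertex_type,
+    edge_type          edge_type,           -- usually a verb
+    properties         JSONB,
+    PRIMARY KEY (subject_identifier, subject_type,
+                 object_identifier, object_type, edge_type)
+);
+```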
+ +**Key concepts:** +- Additive vs non-additive dimensions +- Enumerated types for data quality +- Flexible schemas using maps +- Graph database fundamentals + +**Additive dimensions:** +- Can aggregate subtotals directly +- Entity can only have one value at a time +- Examples: age groups, car counts +- Time window affects additivity + +**Non-additive dimensions:** +- Require count distinct operations +- Entity can have multiple values simultaneously +- Examples: platform users (iOS/Android), car drivers +- Impact ratio metrics and aggregations + +**Graph database structure:** +- Vertices (nodes): + - Identifier + - Type + - Properties map +- Edges (relationships): + - Subject and object identifiers/types + - Edge type (usually verbs) + - Relationship properties + +**Pattern highlight - "Little Book of Pipelines":** +- Uses enums to manage multiple data sources +- Shared schema with custom quality checks +- Scales well for large integration projects +- Examples: unit economics, infrastructure graphs + +**Bottom line:** Choose modeling approach based on use case - enums for limited values (<50), flexible schemas for varying attributes, and graph databases when relationships matter more than entities themselves. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md b/bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md new file mode 100644 index 000000000..3a3e6ed97 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_1_w_free_boot_camp_live_stream.md @@ -0,0 +1,115 @@ +# Data Expert Free Boot Camp Live Q&A Highlights + +*A comprehensive Q&A session covering data engineering career paths, boot camp details, and industry insights.* + + +```mermaid +mindmap + root((Data Engineering)) + Career Growth + Skills + SQL & Python fundamentals + Cloud platforms + AWS primary focus + Apache tools + Spark + Kafka + Flink + Data formats + Parquet + ORC + Orchestration + Airflow + Portfolio Building + Build practical projects + Focus on impact + Show robustness + Demonstrate quality + Provide proof/visualization + Technologies + Cloud Platforms + AWS dominates 55% market + Snowflake for usability + Databricks for features + File Formats + Parquet preferred + ORC alternative + Iceberg for modern lakes + Databases + Vector DBs trending + Postgres versatile + Elastic for search + Industry Trends + AI Impact + Not replacing DE + Enhancing pipeline maintenance + Making work less stressful + Still needs data quality + Data Growth + More data expected + Growing importance of DE + Vector databases rising + Tools Evolution + Moving from Hadoop + Cloud-native solutions + Real-time processing + Best Practices + Pipeline Design + Include staging steps + Consider backfill efficiency + Monitor data quality + Handle failures gracefully + Learning Approach + Learn by doing + Build real projects + Network with seniors + Stay updated with trends + Career Development + Start as analyst if needed + Focus on fundamentals + Create content + Network actively + Work-Life + Time Management + Set boundaries + Balance learning & work + Prioritize important tasks + Skill Development + Continuous learning + Practical experience + Keep up with trends + Career Growth + Build portfolio + Network + Create content +``` + +**Big picture:** Zach Wilson addresses key questions about his free and paid data engineering boot camps, career transitions, and the future of data engineering amidst AI advancements. 
+ +**Boot camp details:** +- Free content available until January 31, 2025 +- New fact data modeling content releasing tomorrow (5 hours) +- AI-graded homework assignments +- Certification opportunities +- Discord community support + +**Key career insights:** +- Focus on AWS over Azure/GCP due to market share (55%) +- Entry-level DE roles are limited; consider data analyst path first +- Core skills needed: SQL, Python, cloud platforms +- Practical experience trumps certifications + +**Technology perspectives:** +- Kafka remains exciting for real-time data +- Parquet vs ORC file formats both relevant +- DBT valuable for enforcing good practices +- Snowflake gaining momentum for usability + +**Business update:** +- Free boot camp expanding reach (15,000 daily active users) +- Paid boot camp ($2,000 with discount) filling up +- Content helping both beginners and senior engineers + +**Bottom line:** Success in data engineering requires continuous learning, practical experience, and understanding business needs. Focus on fundamentals before chasing new technologies. + +**Watch out for:** Time zone compatibility with live sessions (particularly challenging for European participants). \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md b/bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md new file mode 100644 index 000000000..4a1c591ac --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_2_how_meta_models_big_volume_event_data.md @@ -0,0 +1,120 @@ +# Understanding Fact Data Modeling and Volume Optimization + +*A comprehensive exploration of fact data modeling techniques, focusing on data volume optimization and efficiency in large-scale systems.* + + +```mermaid +mindmap + root((Fact Data Modeling)) + Fact Fundamentals + Definition + Atomic events + Cannot be broken down further + Represents actions/occurrences + Characteristics + High volume + Immutable + Time-based + Context dependent + Components + Who fields + User IDs + Device IDs + Where fields + Location + Page/section + When fields + Timestamps + UTC standardization + What fields + Event types + Actions + How fields + Methods + Tools used + Data Volume Management + Raw Facts + Highest granularity + Most flexible + Highest volume + Daily Aggregates + Medium volume + Grouped by day + Preserves some granularity + Reduced Facts + Lowest volume + Array-based storage + Monthly/yearly grouping + Performance Optimization + Shuffle Minimization + Reduce data volume + Pre-bucketing + Optimize joins + SQL Operations + Select/From/Where + Most scalable + No shuffle needed + Group By/Join + Requires shuffle + Medium impact + Order By + Least scalable + Global sorting + Implementation Techniques + Date List Structure + Bit-based storage + 30/31 day periods + Efficient querying + Array Metrics + Monthly arrays + Index-based dates + Efficient aggregation + Quality Guarantees + No duplicates + Required fields + Clean schemas + Business Applications + Long-term Analysis + Historical patterns + Slow decline detection + Multi-year trends + Dimensional Analysis + User segmentation + Geographic patterns + Device patterns + Performance Benefits + Faster queries + Lower storage costs + Better scalability +``` + + +**Big picture:** Fact data modeling requires careful consideration of volume, performance, and usability tradeoffs. Three main approaches exist, each with distinct advantages for different use cases. 
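+
+**Illustrative sketch:** A hypothetical version of the reduced-fact approach listed below, in PostgreSQL terms: one row per user per month, with a daily metric stored as an array indexed by day of month. Table and column names are invented, and the load assumes the pipeline runs once per day in order (the sequential-processing tradeoff noted elsewhere in the course).
+
+```sql
+CREATE TABLE IF NOT EXISTS site_hits (
+    user_id  BIGINT,
+    hit_date DATE,
+    url      TEXT
+);
+
+-- Reduced fact: hits[1] is day 1 of the month, hits[2] is day 2, and so on.
+-- (A bit-mask "datelist" encoding is an even tighter variant of the same idea.)
+CREATE TABLE IF NOT EXISTS monthly_user_site_hits (
+    user_id     BIGINT,
+    month_start DATE,
+    hits        BIGINT[],
+    PRIMARY KEY (user_id, month_start)
+);
+
+-- Fold one day's counts into the month row: "yesterday" is the month-to-date
+-- array, "today" is the new day's aggregate; users first seen today get
+-- zero-padded for the days they missed.
+WITH today AS (
+    SELECT user_id, COUNT(*)::BIGINT AS day_hits
+    FROM site_hits
+    WHERE hit_date = DATE '2024-01-03'
+    GROUP BY user_id
+), yesterday AS (
+    SELECT user_id, hits
+    FROM monthly_user_site_hits
+    WHERE month_start = DATE '2024-01-01'
+)
+INSERT INTO monthly_user_site_hits (user_id, month_start, hits)
+SELECT
+    COALESCE(t.user_id, y.user_id) AS user_id,
+    DATE '2024-01-01'              AS month_start,
+    COALESCE(y.hits, ARRAY_FILL(0::BIGINT, ARRAY[DATE '2024-01-03' - DATE '2024-01-01']))
+        || COALESCE(t.day_hits, 0) AS hits
+FROM today t
+FULL OUTER JOIN yesterday y ON t.user_id = y.user_id
+ON CONFLICT (user_id, month_start) DO UPDATE SET hits = EXCLUDED.hits;
+```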
+ +**Key modeling approaches:** +- Raw facts: Highest granularity, largest volume +- Daily aggregates: Medium volume, good for 1-2 year analyses +- Reduced facts: Lowest volume, best for long-term analysis + +**Performance impacts:** +- Shuffling is a major bottleneck in distributed computing +- SQL operations affect parallelism differently: + - SELECT/FROM/WHERE: Highly parallel + - GROUP BY/JOIN: Requires shuffling + - ORDER BY: Least parallel, avoid for large datasets + +**Innovation highlight - Reduced facts:** +- Stores data as arrays indexed by date +- Reduces storage by ~95% +- Enables decade-long analyses in hours vs weeks +- Maintains daily granularity while minimizing volume + +**Implementation techniques:** +- Use array types for efficient storage +- Leverage bit operations for activity tracking +- Apply careful date indexing for time-based queries +- Pre-aggregate data while maintaining granularity + +**Bottom line:** Successful fact data modeling requires balancing between data volume, query performance, and analytical flexibility. Reduced facts offer significant performance benefits for long-term analyses but require careful implementation. + +**Watch out for:** Dimensional joins can become complex with reduced facts. Consider analytical needs and access patterns when choosing modeling approach. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md b/bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md new file mode 100644 index 000000000..c7d8f1c34 --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_2_w_free_boot_camp_live_black_friday.md @@ -0,0 +1,96 @@ +# Data Engineering Boot Camp Week 2 Live Q&A Session and Black Friday Updates + +*A comprehensive Q&A session covering data engineering career paths, technology choices, and boot camp details.* + + +```mermaid +mindmap + root((Data Engineering)) + Boot Camps + Free Boot Camp + Weeks 1-2 Completed + Data modeling + Facts & dimensions + Dateless structures + Week 3 upcoming + Spark content + Available until Jan 31 2025 + Certification requires homework + Paid Boot Camp + Starts January 6th + Cloud focus + AWS + Databricks + Snowflake + Live classes + Tuesday Wednesday Thursday + 6-8PM PST + Features + Mentorship + Capstone project + 1 year access + Technologies + Data Warehousing + Snowflake + Databricks + BigQuery + Processing + Apache Spark + Apache Flink + Apache Kafka + Storage + S3/Blob Storage + Apache Iceberg + Delta Lake + Career Topics + Skills Needed + SQL + Python + Data Modeling + Cloud Platforms + Business Acumen + Job Market + Analytics Engineer trend + Platform Engineer trend + AI/ML integration + Interview preparation + Future Trends + AI Integration + LLMs in pipelines + Vector databases + Embeddings + RAG applications + Cloud Evolution + Serverless + Platform consolidation + Multi-cloud +``` + + +**Big picture:** Week 2 covered fact data modeling, with Week 3 starting tomorrow focusing on Apache Spark. The January paid boot camp will feature cloud technologies like Databricks and Snowflake. 
+ +**Key bootcamp details:** +- Free boot camp available until January 31, 2025 +- Paid January boot camp runs January 6 - February 14 +- Live classes Tuesday-Thursday, 6-8 PM Pacific +- 30% Black Friday discount available through December 2 + +**Technology insights:** +- Avoid technology preferences ("best" tools) +- Focus on business requirements over specific platforms +- Cloud providers (AWS/Azure/GCP) share similar core services +- Orchestration tools (Airflow/Dagster/etc.) serve similar purposes + +**Career guidance:** +- Data engineering requires SQL and Python foundations +- Analytics background provides good business context +- Technical depth matters more than breadth +- Focus on building real projects over certifications + +**Boot camp structure:** +- Homework assignments graded by AI +- Certification requires completing all assignments +- Mentorship available in paid version +- Cloud infrastructure provided for hands-on practice + +**Bottom line:** Success in data engineering requires strong fundamentals, business understanding, and continuous learning rather than focusing on specific tools or certifications. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md b/bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md new file mode 100644 index 000000000..133b5058b --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_3_build_a_gold_pipeline_like_airbnb_midas_process.md @@ -0,0 +1,102 @@ +# Building Gold-Standard Data Pipelines: The Airbnb Midas Process + + +```mermaid +mindmap + root((Building High + Quality Data + Pipelines)) + (Documentation & Trust) + [Good Documentation] + (Spec Review Process) + (Technical Review) + (Stakeholder Review) + (Flow Diagrams) + (Schema Documentation) + (Quality Checks) + (Metric Definitions) + (Example Queries) + [Building Trust] + (Stakeholder Involvement) + (Clear Business Impact) + (Consider Future Needs) + (Validate with Partners) + (Midas Process Steps) + [1. Create Spec] + [2. Spec Review] + [3. Build & Backfill] + [4. SQL Validation] + [5. Manura Validation] + [6. Data Review] + [7. Code Review] + [8. Metric Migration] + [9. Launch PSA] + (Data Quality) + [Basic Checks] + (Not Null) + (No Duplicates) + (Valid Enums) + (Data Exists) + [Intermediate Checks] + (Week over Week Counts) + (Seasonality) + (Row Count Trends) + [Advanced Checks] + (Machine Learning) + (Complex Relationships) + (Seasonality Adjusted) + (Schema Best Practices) + [Naming Conventions] + (fact_ prefix) + (dim_ prefix) + (scd_ prefix) + (agg_ prefix) + [Documentation] + (Column Comments) + (Table Comments) + (Business Context) + [Quality Standards] + (Dimension Checks) + (Fact Checks) + (Relationship Validation) + (Business Value) + [Direct Revenue] + [Cost Savings] + [Strategic Decisions] + [Leading Indicators] + (Metrics) + [Guardrail Metrics] + (Critical Business KPIs) + (Launch Blockers) + [Non-Guardrail Metrics] + (Informational) + (Contextual) +``` + + +**Why it matters**: High-quality data pipelines are crucial for building trust and driving business value. Airbnb's Midas process offers a comprehensive framework for creating reliable, long-lasting data pipelines. + +**The big picture**: The Midas process consists of 9 key steps: +1. Create a spec +2. Get technical and stakeholder reviews +3. Build and backfill pipeline +4. SQL validation +5. Manura validation (metrics) +6. Data review +7. Code review +8. 
Migrate metrics +9. Launch PSA + +**Key insights**: +* Good documentation upfront prevents painful backfills and builds stakeholder trust +* Not every pipeline needs the full Midas treatment - reserve it for critical, long-term data assets +* Strong specs include flow diagrams, schema definitions, quality checks, and example queries +* Different quality checks are needed for dimension vs. fact tables +* Use week-over-week rather than day-over-day comparisons for more reliable monitoring + +**What to watch**: Quality checks should include: +* Basic checks (nulls, duplicates, enum values) +* Intermediate checks (row counts, week-over-week comparisons) +* Advanced checks (seasonality adjustments, machine learning) + +**Bottom line**: While the full Midas process may seem heavy, even implementing a few steps can dramatically improve data quality and stakeholder trust. The upfront investment in documentation and validation pays off in reduced maintenance and stronger analytics partnerships. \ No newline at end of file diff --git a/bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md b/bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md new file mode 100644 index 000000000..15afc3caf --- /dev/null +++ b/bootcamp/materials/0-smart-brevity-overview/week_3_high_performance_spark_dataframe_dataset_udfs_caching.md @@ -0,0 +1,110 @@ +# Advanced Spark Deep Dive: Performance Optimization & API Selection + + +```mermaid +mindmap + root((Advanced Spark)) + Deployment Options + Spark Server + Submit via CLI + JAR deployment + Session ends after job + Better for production + Spark Notebooks + Interactive development + Persistent session + Needs manual termination + Risky for production + APIs + Spark SQL + Lowest barrier to entry + Best for quick iteration + Good for data scientist collaboration + DataFrame API + Modular code + Better testability + Middle ground approach + Dataset API + Schema enforcement + Better null handling + Functional programming + Native language integration + Performance Optimization + Caching + Memory Only + Default caching strategy + Best for reused datasets + Disk Only + Not recommended + Use staging tables instead + When to use + Multiple usage of dataset + Fits in memory + Broadcast Joins + For small datasets + Configurable threshold + No shuffle needed + Entire dataset copied + Bucket Joins + For large datasets + Reduces shuffling + Partitioned data + Better for production + UDFs + Python UDFs + Serialization overhead + Apache Arrow improvements + Good for simple operations + Scala UDFs + Better performance + Native integration + Preferred for complex operations + Best Practices + Memory Management + Executor memory limits + Driver memory configuration + Optimize for workload + Partitioning + Shuffle partitions + 100-200MB per partition + Optimize file size + Consider data skew + Temporary Views + Similar to CTEs + Cache if reused + Manage memory usage +``` + + +**Why it matters:** Understanding Spark's various APIs, caching strategies, and join optimizations is crucial for building efficient data pipelines at scale. + +**Key takeaways:** + +**Spark Server vs. 
Notebooks:** +* Spark Server deployment (via spark-submit) better mirrors production behavior +* Notebooks convenient for development but risky for production without proper CI/CD +* Notebook caching state can mask performance issues + +**Memory & Performance:** +* Caching is only beneficial when data is reused multiple times +* Default executor memory (16GB) often wasteful - tune based on workload +* Use memory-only caching instead of disk caching; prefer staging tables over disk cache +* Broadcast joins effective for small datasets (typically Date: Fri, 24 Jan 2025 16:16:00 -0500 Subject: [PATCH 2/2] chore: Initialize pytest settings and add SQL homework solutions --- .vscode/settings.json | 7 + .../1-dimensional-data-modeling/README.md | 40 +++- .../1-ddl-for-actors-table.sql | 38 ++++ .../2-cumulative-table-generation.sql | 39 ++++ .../3-ddl-for-actors-history.sql | 26 +++ .../4-backfill-actors-history-scd.sql | 49 +++++ .../5-incremental-actors-history-scd.sql | 109 +++++++++++ .../tests/test_1_dimensional_data_modeling.py | 174 ++++++++++++++++++ .../src/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 196 bytes .../jobs/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 201 bytes ...monthly_user_site_hits_job.cpython-310.pyc | Bin 0 -> 1191 bytes .../players_scd_job.cpython-310.pyc | Bin 0 -> 1841 bytes .../team_vertex_job.cpython-310.pyc | Bin 0 -> 1186 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 202 bytes .../conftest.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 600 bytes ...ser_site_hits.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 1363 bytes ...st_player_scd.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 1066 bytes ...am_vertex_job.cpython-310-pytest-8.3.4.pyc | Bin 0 -> 1191 bytes 18 files changed, 481 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql create mode 100644 bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py create mode 100644 bootcamp/materials/3-spark-fundamentals/src/__pycache__/__init__.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/__init__.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/monthly_user_site_hits_job.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/players_scd_job.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/team_vertex_job.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/__init__.cpython-310.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/conftest.cpython-310-pytest-8.3.4.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_monthly_user_site_hits.cpython-310-pytest-8.3.4.pyc create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_player_scd.cpython-310-pytest-8.3.4.pyc 
create mode 100644 bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_team_vertex_job.cpython-310-pytest-8.3.4.pyc diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..5718996c0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "bootcamp" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/README.md b/bootcamp/materials/1-dimensional-data-modeling/README.md index 5a6ec7480..40e28f717 100644 --- a/bootcamp/materials/1-dimensional-data-modeling/README.md +++ b/bootcamp/materials/1-dimensional-data-modeling/README.md @@ -76,7 +76,45 @@ There are two methods to get Postgres running locally. ```bash docker compose up -d ``` - + + ### 🐳 **Option 3: Run Postgres and PGAdmin in Docker Engine on WSL** + +- Install Docker in your WSL distribution + + ```bash + sudo apt update + sudo apt install -y docker.io + ``` + +- Install Docker Compose + + ``` bash + sudo apt update + sudo curl -L "https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + ``` + +- Add user to docker group + + ``` bash + sudo groupadd docker + sudo usermod -aG docker $USER + ``` + +- Start and enable the Docker service + + ```bash + sudo dockerd + ``` + +- Copy **`example.env`** to **`.env`** in a new terminal: + + ```bash + cd bootcamp/materials/1-dimensional-data-modeling + cp example.env .env + docker-compose up -d + ``` + - A folder named **`postgres-data`** will be created in the root of the repo. The data backing your Postgres instance will be saved here. - You can check that your Docker Compose stack is running by either: - Going into Docker Desktop: you should see an entry there with a drop-down for each of the containers running in your Docker Compose stack. 
diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql new file mode 100644 index 000000000..c3e9ed144 --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/1-ddl-for-actors-table.sql @@ -0,0 +1,38 @@ +DO $$ +BEGIN + EXECUTE 'DROP TYPE IF EXISTS quality_class_enum CASCADE'; + EXECUTE 'DROP TYPE IF EXISTS film_struct CASCADE'; + EXECUTE 'CREATE TYPE quality_class_enum AS ENUM (''star'', ''good'', ''average'', ''bad'')'; + EXECUTE 'CREATE TYPE film_struct AS ( + film VARCHAR(255), + votes INTEGER, + rating DECIMAL(3,1), + filmid UUID + )'; +END $$; + +DROP FUNCTION IF EXISTS films_avg_rating CASCADE; +DROP TABLE IF EXISTS actors CASCADE; + +CREATE OR REPLACE FUNCTION films_avg_rating(films film_struct[]) +RETURNS DECIMAL(3,1) AS $$ +BEGIN + RETURN COALESCE((SELECT AVG(f.rating)::DECIMAL(3,1) FROM unnest(films) AS f), 0); +END; +$$ LANGUAGE plpgsql; + +CREATE TABLE actors ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(255) NOT NULL, + films film_struct[] NOT NULL, + quality_class quality_class_enum NOT NULL, + is_active BOOLEAN NOT NULL DEFAULT false, + CONSTRAINT rating_check CHECK ( + quality_class = CASE + WHEN films_avg_rating(films) > 8 THEN 'star'::quality_class_enum + WHEN films_avg_rating(films) > 7 THEN 'good'::quality_class_enum + WHEN films_avg_rating(films) > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END + ) +); \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql new file mode 100644 index 000000000..2f3dfd2de --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/2-cumulative-table-generation.sql @@ -0,0 +1,39 @@ +TRUNCATE actors; + +WITH RECURSIVE years_desc AS ( + SELECT MAX(year) as year + FROM actor_films + UNION ALL + SELECT year - 1 + FROM years_desc + WHERE year > (SELECT MIN(year) FROM actor_films) +), +yearly_films AS ( + SELECT + actorId, + actor, + year, + ARRAY_AGG( + ROW(film, votes, rating, gen_random_uuid())::film_struct + ORDER BY rating DESC + ) as films, + AVG(rating)::DECIMAL(3,1) as avg_rating, + ROW_NUMBER() OVER (PARTITION BY actorId ORDER BY year DESC) as row_num + FROM actor_films + GROUP BY actorId, actor, year +) +INSERT INTO actors (id, name, films, quality_class, is_active) +SELECT + gen_random_uuid(), + y.actor, + y.films, + (CASE + WHEN y.avg_rating > 8 THEN 'star'::quality_class_enum + WHEN y.avg_rating > 7 THEN 'good'::quality_class_enum + WHEN y.avg_rating > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END), + row_num = 1 +FROM yearly_films y +JOIN years_desc yd ON y.year = yd.year +ORDER BY yd.year DESC; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql new file mode 100644 index 000000000..402e76763 --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/3-ddl-for-actors-history.sql @@ -0,0 +1,26 @@ +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'actors_history_scd' + ) THEN + CREATE TABLE actors_history_scd ( + id UUID PRIMARY KEY DEFAULT 
gen_random_uuid(), + actor_id UUID NOT NULL REFERENCES actors(id), + quality_class quality_class_enum NOT NULL, + is_active BOOLEAN NOT NULL, + start_date TIMESTAMP WITH TIME ZONE NOT NULL, + end_date TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT valid_dates CHECK (start_date <= COALESCE(end_date, 'infinity'::TIMESTAMP)) + ); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'actors_history_scd' + AND indexname = 'idx_actors_history_dates' + ) THEN + CREATE INDEX idx_actors_history_dates ON actors_history_scd (actor_id, start_date, end_date); + END IF; +END $$; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql new file mode 100644 index 000000000..d723e65fd --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/4-backfill-actors-history-scd.sql @@ -0,0 +1,49 @@ +TRUNCATE actors_history_scd; + +WITH actor_class_changes AS ( + SELECT + actorId, + actor, + year, + AVG(rating)::DECIMAL(3,1) as avg_rating, + CASE + WHEN AVG(rating) > 8 THEN 'star'::quality_class_enum + WHEN AVG(rating) > 7 THEN 'good'::quality_class_enum + WHEN AVG(rating) > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END as curr_quality_class, + year = (SELECT MAX(year) FROM actor_films) as curr_is_active, + ROW_NUMBER() OVER (PARTITION BY actorId ORDER BY year) as version_num, + LAG(CASE + WHEN AVG(rating) > 8 THEN 'star'::quality_class_enum + WHEN AVG(rating) > 7 THEN 'good'::quality_class_enum + WHEN AVG(rating) > 6 THEN 'average'::quality_class_enum + ELSE 'bad'::quality_class_enum + END) OVER (PARTITION BY actorId ORDER BY year) as prev_quality_class, + LAG(year = (SELECT MAX(year) FROM actor_films)) OVER (PARTITION BY actorId ORDER BY year) as prev_is_active + FROM actor_films + GROUP BY actorId, actor, year +) +INSERT INTO actors_history_scd ( + actor_id, + quality_class, + is_active, + start_date, + end_date +) +SELECT + a.id as actor_id, + c.curr_quality_class, + c.curr_is_active, + make_timestamp(c.year, 1, 1, 0, 0, 0)::timestamptz as start_date, + CASE + WHEN LEAD(c.year) OVER (PARTITION BY c.actorId ORDER BY c.year) IS NOT NULL + THEN make_timestamp(LEAD(c.year) OVER (PARTITION BY c.actorId ORDER BY c.year), 1, 1, 0, 0, 0)::timestamptz + ELSE NULL + END as end_date +FROM actor_class_changes c +JOIN actors a ON a.name = c.actor +WHERE version_num = 1 + OR curr_quality_class != prev_quality_class + OR curr_is_active != prev_is_active +ORDER BY c.actorId, c.year; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql new file mode 100644 index 000000000..6ac72a8d7 --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/5-incremental-actors-history-scd.sql @@ -0,0 +1,109 @@ +CREATE TYPE actor_scd_type AS ( + quality_class quality_class_enum, + is_active boolean, + start_year INTEGER, + end_year INTEGER +); + +WITH latest_scd AS ( + SELECT * FROM actors_history_scd + WHERE EXTRACT(YEAR FROM end_date) = EXTRACT(YEAR FROM CURRENT_DATE) - 1 + OR end_date IS NULL +), +historical_scd AS ( + SELECT + actor_id, + quality_class, + is_active, + EXTRACT(YEAR FROM start_date)::INTEGER as 
start_year, + EXTRACT(YEAR FROM end_date)::INTEGER as end_year + FROM actors_history_scd + WHERE EXTRACT(YEAR FROM end_date) < EXTRACT(YEAR FROM CURRENT_DATE) - 1 +), +current_actors AS ( + SELECT + id as actor_id, + quality_class, + is_active, + EXTRACT(YEAR FROM CURRENT_DATE)::INTEGER as curr_year + FROM actors +), +unchanged_records AS ( + SELECT + ca.actor_id, + ca.quality_class, + ca.is_active, + EXTRACT(YEAR FROM ls.start_date)::INTEGER as start_year, + ca.curr_year as end_year + FROM current_actors ca + JOIN latest_scd ls ON ls.actor_id = ca.actor_id + WHERE ca.quality_class = ls.quality_class + AND ca.is_active = ls.is_active +), +changed_records AS ( + SELECT + ca.actor_id, + UNNEST(ARRAY[ + ROW( + ls.quality_class, + ls.is_active, + EXTRACT(YEAR FROM ls.start_date)::INTEGER, + EXTRACT(YEAR FROM ls.end_date)::INTEGER + )::actor_scd_type, + ROW( + ca.quality_class, + ca.is_active, + ca.curr_year, + ca.curr_year + )::actor_scd_type + ]) as records + FROM current_actors ca + LEFT JOIN latest_scd ls ON ls.actor_id = ca.actor_id + WHERE ca.quality_class != ls.quality_class + OR ca.is_active != ls.is_active +), +unnested_changed_records AS ( + SELECT + actor_id, + (records::actor_scd_type).quality_class, + (records::actor_scd_type).is_active, + (records::actor_scd_type).start_year, + (records::actor_scd_type).end_year + FROM changed_records +), +new_records AS ( + SELECT + ca.actor_id, + ca.quality_class, + ca.is_active, + ca.curr_year as start_year, + ca.curr_year as end_year + FROM current_actors ca + LEFT JOIN latest_scd ls ON ca.actor_id = ls.actor_id + WHERE ls.actor_id IS NULL +) +INSERT INTO actors_history_scd ( + actor_id, + quality_class, + is_active, + start_date, + end_date +) +SELECT + actor_id, + quality_class, + is_active, + make_timestamp(start_year, 1, 1, 0, 0, 0)::timestamptz as start_date, + CASE + WHEN end_year IS NOT NULL THEN make_timestamp(end_year, 12, 31, 23, 59, 59)::timestamptz + ELSE NULL + END as end_date +FROM ( + SELECT * FROM historical_scd + UNION ALL + SELECT * FROM unchanged_records + UNION ALL + SELECT * FROM unnested_changed_records + UNION ALL + SELECT * FROM new_records +) combined; \ No newline at end of file diff --git a/bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py new file mode 100644 index 000000000..166f1d07c --- /dev/null +++ b/bootcamp/materials/1-dimensional-data-modeling/homework-answers/tests/test_1_dimensional_data_modeling.py @@ -0,0 +1,174 @@ +import pytest +from sqlalchemy import create_engine, text +from datetime import date +import json + +@pytest.fixture(scope="session") +def db_engine(): + return create_engine('postgresql://postgres:postgres@localhost:5432/postgres') + +def test_1_validate_actors_ddl(): + expected_ddl = """ + CREATE TYPE film_struct AS ( + film VARCHAR(255), + votes INTEGER, + rating DECIMAL(3,1), + filmid UUID + ); + + CREATE TYPE quality_class_enum AS ENUM ('star', 'good', 'average', 'bad'); + + CREATE TABLE actors ( + id UUID PRIMARY KEY, + name VARCHAR(255) NOT NULL, + films film_struct[] NOT NULL, + quality_class quality_class_enum, + is_active BOOLEAN NOT NULL + ) + """ + assert expected_ddl.strip() == "YOUR_DDL_HERE".strip() + + +def test_2_cumulative_table_generation(db_engine): + with db_engine.connect() as conn: + test_data = [ + (1, 'Actor1', 2022, json.dumps([ + {"film": "Film1", "votes": 1000, "rating": 8.5, "filmid": "f1", "year": 
2022}, + {"film": "Film2", "votes": 1000, "rating": 7.5, "filmid": "f2", "year": 2021} + ])), + (2, 'Actor2', 2022, json.dumps([ + {"film": "Film3", "votes": 1000, "rating": 6.5, "filmid": "f3", "year": 2022}, + {"film": "Film4", "votes": 1000, "rating": 5.5, "filmid": "f4", "year": 2021} + ])) + ] + + for id, name, year, films in test_data: + conn.execute( + text("INSERT INTO actors (id, name, created_year, films) VALUES (:id, :name, :year, :films::jsonb)"), + {"id": id, "name": name, "year": year, "films": films} + ) + conn.commit() + + cumulative_query = """ + WITH yearly_stats AS ( + SELECT + a.id, + film_data->>'year' as year, + AVG((film_data->>'rating')::float) as avg_rating, + bool_or((film_data->>'year')::int = EXTRACT(YEAR FROM CURRENT_DATE)) as is_active + FROM actors a, + jsonb_array_elements(films) as film_data + WHERE (film_data->>'year')::int <= :target_year + GROUP BY a.id, film_data->>'year' + ) + UPDATE actors a + SET quality_class = + CASE + WHEN ys.avg_rating > 8 THEN 'star' + WHEN ys.avg_rating > 7 THEN 'good' + WHEN ys.avg_rating > 6 THEN 'average' + ELSE 'bad' + END, + is_active = ys.is_active + FROM yearly_stats ys + WHERE a.id = ys.id + AND ys.year = :target_year + """ + + conn.execute(text(cumulative_query), {"target_year": 2022}) + conn.commit() + + result = conn.execute(text("SELECT id, quality_class, is_active FROM actors")) + actor_stats = {row[0]: (row[1], row[2]) for row in result} + + assert actor_stats[1][0] == 'star' + assert actor_stats[1][1] is True + assert actor_stats[2][0] == 'average' + assert actor_stats[2][1] is True + + +def test_3_validate_scd_ddl(): + expected_ddl = """ + CREATE TABLE actors_history_scd ( + actor_id BIGINT REFERENCES actors(id), + quality_class VARCHAR(10) CHECK (quality_class IN ('star', 'good', 'average', 'bad')), + is_active BOOLEAN NOT NULL, + start_date DATE NOT NULL, + end_date DATE, + is_current BOOLEAN NOT NULL, + CONSTRAINT valid_dates CHECK (end_date IS NULL OR end_date >= start_date) + ) + """ + assert expected_ddl.strip() == "YOUR_SCD_DDL_HERE".strip() + +def test_4_scd_backfill(db_engine): + with db_engine.connect() as conn: + backfill_query = """ + INSERT INTO actors_history_scd ( + actor_id, quality_class, is_active, start_date, end_date, is_current + ) + SELECT DISTINCT ON (a.id) + a.id, + a.quality_class, + a.is_active, + make_date(a.created_year, 1, 1) as start_date, + NULL as end_date, + TRUE as is_current + FROM actors a + ORDER BY a.id, a.created_year DESC + """ + + conn.execute(text(backfill_query)) + conn.commit() + + result = conn.execute(text("SELECT COUNT(*) FROM actors_history_scd")) + assert result.scalar() == 2 + +def test_5_scd_incremental_update(db_engine): + with db_engine.connect() as conn: + incremental_query = """ + WITH updates AS ( + SELECT + a.id as actor_id, + a.quality_class, + a.is_active, + CURRENT_DATE as start_date + FROM actors a + JOIN actors_history_scd h + ON a.id = h.actor_id + AND h.is_current = TRUE + WHERE h.quality_class != a.quality_class + OR h.is_active != a.is_active + ) + INSERT INTO actors_history_scd ( + actor_id, quality_class, is_active, start_date, end_date, is_current + ) + SELECT + u.actor_id, + u.quality_class, + u.is_active, + u.start_date, + NULL, + TRUE + FROM updates u + """ + + # Update test data + conn.execute(text(""" + UPDATE actors + SET quality_class = 'good', is_active = false + WHERE id = 1 + """)) + + conn.execute(text(incremental_query)) + conn.commit() + + result = conn.execute(text(""" + SELECT COUNT(*) + FROM actors_history_scd + WHERE actor_id = 
diff --git a/bootcamp/materials/3-spark-fundamentals/src/__pycache__/__init__.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..312dfe4ffa5bd8977c9e6fe5055cb959f29ed45f
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 196 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/__init__.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3048d13ec785f2d26e8619b8ee5c0dda26d01d5f
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 201 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/monthly_user_site_hits_job.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/monthly_user_site_hits_job.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24a6e6a1b9b1ca38271f840798080ebb88500648
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 1191 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/team_vertex_job.cpython-310.pyc b/bootcamp/materials/3-spark-fundamentals/src/jobs/__pycache__/team_vertex_job.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdd3e52d03f3501108047cecb18eabd69ac843ac
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 1186 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/conftest.cpython-310-pytest-8.3.4.pyc b/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/conftest.cpython-310-pytest-8.3.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..046790b76730c36e243eef22acb14fcd0a5c0ab3
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 600 bytes]
diff --git a/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_monthly_user_site_hits.cpython-310-pytest-8.3.4.pyc b/bootcamp/materials/3-spark-fundamentals/src/tests/__pycache__/test_monthly_user_site_hits.cpython-310-pytest-8.3.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..feae115187e05449f8f89cf39578ebc47424516c
GIT binary patch
[base85 payload omitted: compiled bytecode cache, 1363 bytes]