Update databases.html
· 1 year ago
206b4ba759c98066cc640efa7e00a6962b70df5f
Parent:
28a8d3123
1 file changed +539 −166
- databases.html +539 −166
Diff
--- a/databases.html +++ b/databases.html @@ -3,31 +3,31 @@ <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> - <title>Databases Cheatsheet - Interactive database diagram cheatsheet comparing SQL (Postgres, MySQL, SQL Server, SQLite) vs NoSQL (MongoDB, Redis, Cassandra, Neo4j) vs Search (Elasticsearch) on philosophies, tradeoffs, scalability, and use cases</title> + <title>Databases Cheatsheet - Interactive database diagram cheatsheet comparing SQL (Postgres, MySQL, SQL Server, SQLite) vs NoSQL (MongoDB, Redis, Cassandra, Neo4j) vs Search (Elasticsearch) vs Modern (Timescale, Clickhouse, DuckDB, Vector) on philosophies, tradeoffs, scalability, and use cases</title> <!-- SEO Meta Description --> - <meta name="description" content="Interactive database diagram cheatsheet comparing SQL (Postgres, MySQL, SQL Server, SQLite) vs NoSQL (MongoDB, Redis, Cassandra, Neo4j) vs Search (Elasticsearch) on philosophies, tradeoffs, scalability, and use cases."> + <meta name="description" content="Interactive database diagram cheatsheet comparing SQL, NoSQL, Search, and Modern databases (Postgres, MySQL, MongoDB, Redis, Cassandra, Neo4j, Elasticsearch, TimescaleDB, ClickHouse, DuckDB, Vector DBs) on philosophies, tradeoffs, scalability, index types, and use cases for senior architects."> <!-- Canonical URL --> <link rel="canonical" href="http://cheatsheets.davidveksler.com/databases.html"> <!-- Social Media Metadata --> <!-- Open Graph / Facebook --> - <meta property="og:title" content="Database Diagram Cheatsheet - Philosophies & Tradeoffs"> - <meta property="og:description" content="Interactive visual guide comparing SQL vs NoSQL vs Search databases (Postgres, MySQL, Mongo, Redis, Cassandra, Neo4j, Elasticsearch) on design, tradeoffs & use cases."> + <meta property="og:title" content="Comprehensive Database Diagram Cheatsheet - Philosophies & Tradeoffs"> + <meta property="og:description" content="Interactive visual guide comparing SQL vs NoSQL vs Search vs Modern databases (Postgres, MySQL, Mongo, Redis, Cassandra, Neo4j, Elasticsearch, Timescale, ClickHouse, DuckDB, Vector) on design, tradeoffs, indexes & use cases."> <meta property="og:type" content="article"> - <meta property="og:url" content="https://cheatsheets.davidveksler.com/databases.html"> - <meta property="og:image" content="https://cheatsheets.davidveksler.com/images/databases.png"> - <meta property="og:image:alt" content="Diagram showing connections between database categories and specific database examples like PostgreSQL, MongoDB, Redis."> + <meta property="og:url" content="https://cheatsheets.davidveksler.com/databases.html"> + <meta property="og:image" content="https://cheatsheets.davidveksler.com/images/databases.png"> <!-- Consider updating image --> + <meta property="og:image:alt" content="Diagram showing connections between database categories like SQL, NoSQL, Modern, and specific database examples like PostgreSQL, MongoDB, Redis, TimescaleDB."> <!-- Optional: og:site_name --> <!-- <meta property="og:site_name" content="David Veksler Cheatsheets"> --> <!-- Twitter Card --> <meta name="twitter:card" content="summary_large_image"> - <meta name="twitter:title" content="Database Diagram Cheatsheet - Philosophies & Tradeoffs"> - <meta name="twitter:description" content="Interactive visual guide comparing SQL vs NoSQL vs Search databases (Postgres, MySQL, Mongo, Redis, Cassandra, Neo4j, Elasticsearch) on design, tradeoffs & use cases."> - <meta name="twitter:image" content="https://cheatsheets.davidveksler.com/images/databases.png"> - <meta name="twitter:image:alt" content="Diagram showing connections between database categories and specific database examples like PostgreSQL, MongoDB, Redis."> + <meta name="twitter:title" content="Comprehensive Database Diagram Cheatsheet - Philosophies & Tradeoffs"> + <meta name="twitter:description" content="Interactive visual guide comparing SQL vs NoSQL vs Search vs Modern databases (Postgres, MySQL, Mongo, Redis, Cassandra, Neo4j, Elasticsearch, Timescale, ClickHouse, DuckDB, Vector) on design, tradeoffs, indexes & use cases."> + <meta name="twitter:image" content="https://cheatsheets.davidveksler.com/images/databases.png"> <!-- Consider updating image --> + <meta name="twitter:image:alt" content="Diagram showing connections between database categories like SQL, NoSQL, Modern, and specific database examples like PostgreSQL, MongoDB, Redis, TimescaleDB."> <!-- Optional: twitter:site and twitter:creator --> <!-- <meta name="twitter:site" content="@YourTwitterHandle"> --> <!-- <meta name="twitter:creator" content="@DavidVekslerTwitterHandle"> --> @@ -58,15 +58,19 @@ --schema-border-color: #a5b4fc; /* --- Database Category Colors --- */ - --db-color-sql: #3b82f6; - --db-color-nosql-doc: #10b981; - --db-color-nosql-kv: #f97316; - --db-color-nosql-wc: #ef4444; - --db-color-nosql-graph: #8b5cf6; - --db-color-search: #f59e0b; - --db-color-concept: #6b7280; - --db-color-tradeoff: #64748b; - --db-color-choosing: #4f46e5; + --db-color-sql: #3b82f6; /* Blue */ + --db-color-nosql-doc: #10b981; /* Green */ + --db-color-nosql-kv: #f97316; /* Orange */ + --db-color-nosql-wc: #ef4444; /* Red */ + --db-color-nosql-graph: #8b5cf6; /* Purple */ + --db-color-search: #f59e0b; /* Amber */ + --db-color-concept: #6b7280; /* Gray */ + --db-color-tradeoff: #64748b; /* Slate */ + --db-color-choosing: #4f46e5; /* Indigo */ + + /* --- NEW COLORS --- */ + --db-color-modern: #0ea5e9; /* Cyan-Blue */ + --db-color-index: #14b8a6; /* Teal */ --db-category-color: var(--db-color-concept); /* Default */ } @@ -212,6 +216,9 @@ padding: 0.3rem 0.6rem; color: var(--db-category-color); border: 1px solid var(--db-category-color); background-color: transparent; transition: background-color 0.2s ease, color 0.2s ease; + display: inline-flex; /* Ensure icon aligns properly */ + align-items: center; /* Vertically center icon */ + gap: 0.3em; /* Space between text and icon */ } .details-toggle:hover { background-color: var(--db-category-color); color: white; } .details-toggle .bi { transition: transform 0.2s ease-in-out; } @@ -221,25 +228,30 @@ .leader-line { pointer-events: none; z-index: 20; transition: opacity 0.3s ease-in-out; } - /* Color Coding Class Assignments */ + /* --- Color Coding Class Assignments --- */ .cat-concept, .db-type-concept { --db-category-color: var(--db-color-concept); } .cat-sql, .db-type-sql { --db-category-color: var(--db-color-sql); } .cat-nosql .db-type-nosql-doc { --db-category-color: var(--db-color-nosql-doc); } .cat-nosql .db-type-nosql-kv { --db-category-color: var(--db-color-nosql-kv); } .cat-nosql .db-type-nosql-wc { --db-category-color: var(--db-color-nosql-wc); } .cat-nosql .db-type-nosql-graph { --db-category-color: var(--db-color-nosql-graph); } - .cat-nosql .db-type-search { --db-category-color: var(--db-color-search); } + /* Assign search color directly */ + .cat-nosql .db-type-search, .db-type-search { --db-category-color: var(--db-color-search); } /* Give NoSQL container a default color if needed, maybe doc green? */ .cat-nosql { --db-category-color: var(--db-color-nosql-doc); } .cat-tradeoff, .db-type-tradeoff { --db-category-color: var(--db-color-tradeoff); } .cat-choosing, .db-type-choosing { --db-category-color: var(--db-color-choosing); } + /* --- NEW CATEGORY STYLES --- */ + .cat-modern, .db-type-modern { --db-category-color: var(--db-color-modern); } + .cat-index, .db-type-index { --db-category-color: var(--db-color-index); } + </style> </head> <body> <header class="page-header"> <h1 class="display-5"><i class="bi bi-diagram-3"></i> Databases Cheatsheet</h1> - <p class="lead">An interactive visual guide to database philosophies, tradeoffs, and use cases.</p> + <p class="lead">An interactive visual guide to database philosophies, tradeoffs, and use cases for architects.</p> </header> <div class="container" id="main-container"> @@ -398,6 +410,28 @@ </div> </div> </div> + <!-- NEW: MVCC vs 2PL Card --> + <div class="col-lg-4 col-md-6"> + <div class="info-card db-type-concept" id="card-cc"> + <div class="card-body"> + <h5><i class="bi bi-arrows-collapse"></i> MVCC vs 2PL</h5> + <div class="card-content-wrapper"> + <p class="summary">Row snapshots (<span class="term">MVCC</span>) keep readers lock-free; <span class="term">2-Phase Locking</span> guarantees serial order but risks deadlocks.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseCC" aria-expanded="false" aria-controls="collapseCC"> + Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseCC"> + <h6>Concurrency Control Mechanisms</h6> + <ul> + <li><strong>Multi-Version Concurrency Control (MVCC):</strong> Maintains multiple versions of rows. Readers access a consistent snapshot without blocking writers, and writers don't block readers. Reduces lock contention but requires cleanup (vacuuming) of old versions. <br/><em>Examples: PostgreSQL, MySQL (InnoDB), Oracle.</em></li> + <li><strong>Two-Phase Locking (2PL):</strong> Acquires locks during transaction execution (growing phase) and releases them only after commit/abort (shrinking phase). Prevents conflicts but can lead to deadlocks and reduced concurrency as transactions block each other. Pessimistic (locks before access) or Optimistic variants exist. <br/><em>Examples: SQL Server (pessimistic default), DB2.</em></li> + <li><strong>Optimistic Concurrency Control (OCC):</strong> Assumes conflicts are rare. Transactions proceed without locks and validate against conflicts only at commit time. High throughput if conflicts are infrequent, but high abort rates otherwise. <br/><em>Examples: Used in some distributed systems like Spanner/Calvin, some configurations of other DBs.</em></li> + </ul> + </div> + </div> + </div> + <!-- END NEW CARD --> </div> <!-- /.row --> </div> <!-- /.schema-container --> @@ -437,6 +471,7 @@ <li>Can have higher resource usage per connection (process-based model historically).</li> <li>Potentially steeper learning curve for advanced features compared to simpler databases.</li> <li>Horizontal scaling configuration can be complex.</li> + <li>Vacuuming overhead for MVCC management.</li> </ul> <h6>Use Cases</h6> <p>Complex applications, data warehousing, geospatial applications, systems needing strong data integrity (finance), general-purpose OLTP/OLAP where advanced features are valuable.</p> @@ -458,7 +493,7 @@ <h6>Philosophy & Core</h6> <p>Emphasis on ease of use, speed (particularly read-heavy workloads), reliability, and being a versatile, widely adopted database, especially in web development (LAMP/LEMP stacks). Offers flexibility via pluggable storage engines (e.g., InnoDB, MyISAM).</p> <h6>Schema & Consistency</h6> - <p>Schema-on-Write. ACID compliance primarily provided by the default InnoDB storage engine. Older/other engines like MyISAM may trade ACID guarantees for performance or specific features (e.g., full-text search historically).</p> + <p>Schema-on-Write. ACID compliance primarily provided by the default InnoDB storage engine (uses MVCC). Older/other engines like MyISAM may trade ACID guarantees for performance or specific features (e.g., full-text search historically).</p> <h6>Scalability</h6> <p>Primarily Vertical Scaling. Strong built-in asynchronous/semi-synchronous Replication capabilities for read scaling (Horizontal). Group Replication and InnoDB Cluster provide more advanced HA/write scaling options. Third-party solutions like Vitess offer large-scale sharding.</p> <h6>Strengths</h6> @@ -495,7 +530,7 @@ <h6>Philosophy & Core</h6> <p>A comprehensive enterprise data platform offering robust performance, security, and a wide array of integrated services (Business Intelligence - SSAS, ETL - SSIS, Reporting - SSRS). Deeply integrated with Microsoft products and Azure cloud.</p> <h6>Schema & Consistency</h6> - <p>Schema-on-Write with strong ACID compliance.</p> + <p>Schema-on-Write with strong ACID compliance. Primarily uses pessimistic locking (2PL) by default, though optimistic (snapshot isolation via row versioning, similar to MVCC) is available.</p> <h6>Scalability</h6> <p>Strong Vertical Scaling. Horizontal scaling achieved through features like Replication, Log Shipping, and sophisticated Always On Availability Groups (providing HA/DR and read scaling). Built-in table partitioning.</p> <h6>Strengths</h6> @@ -512,6 +547,7 @@ <li>Licensing costs can be substantial, especially for Enterprise edition features.</li> <li>Historically Windows-centric, though cross-platform support is mature now.</li> <li>Can feel somewhat vendor-locked into the Microsoft ecosystem.</li> + <li>Default locking model can lead to higher blocking/deadlocks under high concurrency compared to MVCC.</li> </ul> <h6>Use Cases</h6> <p>Enterprise applications (ERP, CRM), business intelligence and data warehousing, applications built on the Microsoft stack (.NET), systems requiring high levels of security and built-in analytics/reporting.</p> @@ -533,9 +569,9 @@ <h6>Philosophy & Core</h6> <p>Simplicity, portability, reliability. Provides a full-featured SQL database engine as a library linked directly into an application. The entire database (schema, tables, data) is stored in a single cross-platform file. No separate server process.</p> <h6>Schema & Consistency</h6> - <p>Schema-on-Write, but uses dynamic typing ('manifest typing') - you can store any value type in almost any column. Fully ACID compliant; transactions are serializable by default, ensuring strong consistency for single-user or low-concurrency scenarios.</p> + <p>Schema-on-Write, but uses dynamic typing ('manifest typing') - you can store any value type in almost any column. Fully ACID compliant; transactions are serializable by default, ensuring strong consistency for single-user or low-concurrency scenarios using file-level locking.</p> <h6>Scalability</h6> - <p>Limited to the resources of the single host machine. Not designed for high concurrency, especially write concurrency (default mode allows only one writer at a time). Not suitable as a backend for client-server applications with many concurrent users.</p> + <p>Limited to the resources of the single host machine. Not designed for high concurrency, especially write concurrency (default mode allows only one writer at a time via file locking). WAL mode improves read/write concurrency somewhat. Not suitable as a backend for client-server applications with many concurrent users.</p> <h6>Strengths</h6> <ul> <li>Extremely lightweight and fast for single-user access.</li> @@ -543,7 +579,7 @@ <li>Fully ACID transactional guarantees.</li> <li>Database contained in a single portable file.</li> <li>Full SQL implementation (most standard features).</li> - <li>Excellent for read-heavy workloads from multiple processes.</li> + <li>Excellent for read-heavy workloads from multiple processes (esp. with WAL).</li> </ul> <h6>Tradeoffs</h6> <ul> @@ -553,7 +589,7 @@ <li>Lacks user management and granular permissions found in server databases.</li> </ul> <h6>Use Cases</h6> - <p>Mobile applications (Android, iOS), desktop applications, embedded systems, application file formats, browser data storage (historically Web SQL), testing environments, websites with low write concurrency.</p> + <p>Mobile applications (Android, iOS), desktop applications, embedded systems, application file formats, browser data storage (historically Web SQL), testing environments, websites with low write concurrency, data interchange format.</p> </div> </div> </div> @@ -579,7 +615,7 @@ <h6>Philosophy & Core</h6> <p>Store data in flexible, JSON-like documents (BSON format). Designed for ease of development (documents map well to objects in code), flexible schemas, and horizontal scalability.</p> <h6>Schema & Consistency</h6> - <p>Schema-on-Read (dynamic schema). Optional schema validation can be enforced. Consistency is tunable per operation using Read/Write Concerns. Can operate as CP or AP depending on configuration. Supports multi-document ACID transactions in recent versions (with performance considerations).</p> + <p>Schema-on-Read (dynamic schema). Optional schema validation can be enforced. Consistency is tunable per operation using Read/Write Concerns. Can operate as CP or AP depending on configuration and read preferences. Supports multi-document ACID transactions in recent versions (with performance considerations).</p> <h6>Scalability</h6> <p>Excellent Horizontal Scalability via built-in sharding (distributing data across multiple servers/clusters). Read scaling achieved through replica sets (providing redundancy and failover).</p> <h6>Strengths</h6> @@ -618,9 +654,9 @@ <h6>Philosophy & Core</h6> <p>Simplicity and performance. Primarily an in-memory data structure server, used as a key-value store, cache, message broker, and more. Maps keys to various value types (strings, lists, sets, sorted sets, hashes, bitmaps, streams).</p> <h6>Schema & Consistency</h6> - <p>Schema-less (value structure is defined by application). Single-node Redis offers strong consistency (atomic operations). Redis Cluster prioritizes Availability (AP) during partitions, though efforts are made to maintain consistency.</p> + <p>Schema-less (value structure is defined by application). Single-node Redis offers strong consistency (atomic operations). Redis Cluster prioritizes Availability (AP) during partitions, though efforts are made to maintain consistency (data might be stale until partition heals).</p> <h6>Scalability</h6> - <p>Primarily Vertical Scaling for a single instance. Redis Cluster provides Horizontal Scaling (sharding data across nodes). Replication (master-slave) used for read scaling and HA.</p> + <p>Primarily Vertical Scaling for a single instance. Redis Cluster provides Horizontal Scaling (sharding data across nodes). Replication (master-replica) used for read scaling and HA.</p> <h6>Strengths</h6> <ul> <li>Blazing fast performance (mostly in-memory operations).</li> @@ -636,6 +672,7 @@ <li>Querying is limited to key lookups or operations on specific data structures (no complex ad-hoc querying).</li> <li>Cluster management adds complexity.</li> <li>Persistence options have performance tradeoffs.</li> + <li>Eventual consistency across replicas/cluster nodes.</li> </ul> <h6>Use Cases</h6> <p>Caching (database query results, web pages), session management, real-time leaderboards/counters, rate limiting, message queues (Pub/Sub, Streams), geospatial indexing, fast data ingest buffer.</p> @@ -655,14 +692,14 @@ </div> <div class="collapse collapse-content" id="collapseNoSQL3"> <h6>Philosophy & Core</h6> - <p>Handle huge datasets across many commodity servers with no single point of failure. Prioritizes availability and partition tolerance (AP system). Masterless architecture ensures high availability and excellent write performance.</p> + <p>Handle huge datasets across many commodity servers with no single point of failure. Prioritizes availability and partition tolerance (AP system). Masterless architecture ensures high availability and excellent write performance via LSM-Tree storage engine.</p> <h6>Schema & Consistency</h6> <p>Schema uses Keyspaces (like DBs), Tables, Rows, and Columns. Rows identified by a primary key can have varying sets of columns (wide-column aspect). Consistency is tunable per-query (from `ONE` to `ALL`), defaulting to eventual consistency. Designed for Availability.</p> <h6>Scalability</h6> <p>Excellent linear Horizontal Scalability for both reads and writes (especially writes). Adding nodes increases capacity and throughput proportionally. Designed for multi-datacenter deployments.</p> <h6>Strengths</h6> <ul> - <li>Massive write throughput.</li> + <li>Massive write throughput (leveraging LSM Trees).</li> <li>High availability and fault tolerance (no single point of failure).</li> <li>Linear scalability by adding nodes.</li> <li>Tunable consistency levels per operation.</li> @@ -671,9 +708,9 @@ <h6>Tradeoffs</h6> <ul> <li>Eventual consistency requires careful application design.</li> - <li>Reads can be slower than writes, especially with higher consistency levels.</li> + <li>Reads can be slower than writes, especially with higher consistency levels or data spread across nodes.</li> <li>Query language (CQL) is SQL-like but lacks JOINs, group by aggregates (limited), and complex transactions. Data modeling requires query-first thinking.</li> - <li>Operational complexity (managing/tuning clusters).</li> + <li>Operational complexity (managing/tuning clusters, compaction strategies).</li> <li>Limited ad-hoc querying capabilities.</li> </ul> <h6>Use Cases</h6> @@ -694,11 +731,11 @@ </div> <div class="collapse collapse-content" id="collapseNoSQL4"> <h6>Philosophy & Core</h6> - <p>Relationships are first-class citizens. Data is modeled as Nodes (entities) and Relationships (edges connecting nodes), both of which can have properties. Optimized for traversing these connections efficiently (performance often independent of total dataset size).</p> + <p>Relationships are first-class citizens. Data is modeled as Nodes (entities) and Relationships (edges connecting nodes), both of which can have properties. Optimized for traversing these connections efficiently (performance often independent of total dataset size for local traversals).</p> <h6>Schema & Consistency</h6> - <p>Typically schema-flexible or optional schema. Structure emerges from the graph connections. Neo4j provides ACID compliance for transactions within a single instance or causal consistency in clustered deployments.</p> + <p>Typically schema-flexible or optional schema (constraints can enforce structure). Structure emerges from the graph connections. Neo4j provides ACID compliance for transactions within a single instance or causal consistency in clustered deployments (read-your-writes).</p> <h6>Scalability</h6> - <p>Vertical scaling is common. Horizontal scaling is achieved via Causal Clustering (primarily for read scaling and HA). Scaling writes across a cluster involves careful data modeling and architecture, potentially more complex than sharding in other NoSQL types.</p> + <p>Vertical scaling is common. Horizontal scaling is achieved via Causal Clustering (primarily for read scaling and HA). Scaling writes across a cluster involves Fabric (sharding), which adds complexity and has different consistency guarantees depending on the query.</p> <h6>Strengths</h6> <ul> <li>Ideal for managing and querying highly interconnected data.</li> @@ -712,7 +749,7 @@ <li>Not optimized for queries requiring aggregation over the entire dataset (e.g., counting all nodes of a certain type across the whole graph).</li> <li>May be less suitable for simple tabular data structures or bulk updates unrelated to relationships.</li> <li>Requires learning a different query paradigm (Cypher).</li> - <li>Write scaling across large clusters can be challenging.</li> + <li>Write scaling across large clusters can be challenging to design and manage effectively.</li> </ul> <h6>Use Cases</h6> <p>Social networks, recommendation engines, fraud detection, network and IT operations mapping, identity and access management, knowledge graphs, supply chain management, bioinformatics.</p> @@ -720,6 +757,7 @@ </div> </div> <div class="col-lg-3 col-md-6"> + <!-- Note: Adjusted col size for better spacing --> <div class="info-card db-type-search" id="card-elasticsearch"> <div class="card-body"> <h5><i class="bi bi-search"></i> Elasticsearch</h5> @@ -734,7 +772,7 @@ <h6>Philosophy & Core</h6> <p>Provide powerful and fast search and analytics capabilities over large volumes of (primarily text) data. Built on Apache Lucene, it uses inverted indices for efficient term lookups. Often used alongside a primary database.</p> <h6>Schema & Consistency</h6> - <p>Document-oriented (stores JSON documents). Primarily Schema-on-Read, but defining explicit mappings (schema) is highly recommended for performance, relevance tuning, and controlling data types. Operates with near real-time (NRT) consistency – changes are typically searchable within seconds (eventual consistency). Distributed nature prioritizes Availability/Partition Tolerance (AP).</p> + <p>Document-oriented (stores JSON documents). Primarily Schema-on-Read, but defining explicit mappings (schema) is highly recommended for performance, relevance tuning, and controlling data types. Operates with near real-time (NRT) consistency – changes are typically searchable within seconds (eventual consistency, default refresh interval 1s). Distributed nature prioritizes Availability/Partition Tolerance (AP).</p> <h6>Scalability</h6> <p>Excellent Horizontal Scalability. Designed as a distributed system using nodes, indices (like databases), and shards (partitions of an index). Adding nodes increases storage capacity, indexing throughput, and query performance.</p> <h6>Strengths</h6> @@ -751,8 +789,9 @@ <li>Not designed as a primary transactional store (lacks traditional ACID transactions across multiple documents).</li> <li>Eventual consistency means writes aren't immediately searchable.</li> <li>Can require significant resources (RAM, CPU, disk I/O) due to indexing overhead.</li> - <li>Cluster management, tuning, and query optimization can be complex.</li> + <li>Cluster management, tuning (shard sizing, JVM), and query optimization can be complex.</li> <li>Updates require reindexing documents (though optimized internally).</li> + <li>Potential for split-brain scenarios in older versions or misconfigured clusters.</li> </ul> <h6>Use Cases</h6> <p>Website/application search, log aggregation and analysis (ELK Stack), application performance monitoring (APM), security information and event management (SIEM), business intelligence dashboards, geospatial search, product search and filtering.</p> @@ -762,16 +801,297 @@ </div> <!-- /.row --> </div> <!-- /.schema-container --> + + <!-- NEW SECTION: MODERN ENGINES --> + <div class="schema-container cat-modern" data-section-id="section-modern"> + <h2 class="section-title" id="section-modern">Modern Engines (Emerging & Specialized)</h2> + <div class="row"> + + <!-- TimescaleDB --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-modern" id="card-timescale"> + <div class="card-body"> + <h5><i class="bi bi-clock-history"></i> TimescaleDB</h5> + <div class="card-content-wrapper"> + <p class="summary">Postgres extension for <span class="term">time-series</span>; automatic partitioning (<span class="term">hypertables</span>), columnar compression, specialized functions.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseModern1" aria-expanded="false" aria-controls="collapseModern1">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseModern1"> + <h6>Philosophy & Core</h6> + <p>Extend PostgreSQL with specialized capabilities for time-series data while retaining full SQL compatibility and the Postgres ecosystem. Uses automatic time/space partitioning (hypertables, chunks) for efficient data management and querying.</p> + <h6>Scalability & Consistency</h6> + <p>Inherits Postgres's vertical scaling and ACID properties. Horizontal scaling via multi-node deployments (distributed hypertables) is available, adding complexity similar to other distributed SQL systems. Read scaling via PG replicas works seamlessly.</p> + <h6>Strengths</h6> + <ul> + <li>Leverages existing Postgres knowledge and tooling.</li> + <li>Automatic partitioning simplifies management of large time-series datasets.</li> + <li>Built-in columnar compression significantly reduces storage.</li> + <li>Specialized time-series functions (e.g., `time_bucket`, `first`, `last`, continuous aggregates).</li> + <li>Data retention policies and downsampling built-in.</li> + </ul> + <h6>Trade-offs</h6> + <ul> + <li>Write performance for very high ingest rates can still be limited by the underlying Postgres architecture (though significantly improved over vanilla PG for time-series).</li> + <li>Distributed hypertables add operational complexity.</li> + <li>Some advanced features might require licensed versions or cloud offerings.</li> + </ul> + <h6>Use Cases</h6> + <p>IoT sensor data, application & infrastructure monitoring/metrics, financial market data (ticks), industrial telemetry, geospatial time-series tracking.</p> + </div> + </div> + </div> + + <!-- ClickHouse --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-modern" id="card-clickhouse"> + <div class="card-body"> + <h5><i class="bi bi-lightning-charge"></i> ClickHouse</h5> + <div class="card-content-wrapper"> + <p class="summary">Blazing fast open-source <span class="term">columnar OLAP</span> DB; vectorized query execution, real-time analytics on petabytes.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseModern2" aria-expanded="false" aria-controls="collapseModern2">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseModern2"> + <h6>Philosophy & Core</h6> + <p>Designed from the ground up for extreme OLAP query speed. Uses columnar storage, vectorized query execution (processing data in batches using CPU SIMD instructions), and data compression for high performance on analytical workloads.</p> + <h6>Scalability & Consistency</h6> + <p>Excellent horizontal scalability via native sharding and replication. Typically operates with eventual consistency across replicas. Data consistency within a single node is strong for batch inserts. Lacks traditional row-level updates/deletes and transactional guarantees (designed for append-heavy workloads).</p> + <h6>Strengths</h6> + <ul> + <li>Extremely fast query performance, often processing billions of rows per second per server.</li> + <li>Highly efficient storage due to columnar format and compression codecs.</li> + <li>Built for horizontal scaling (sharding/replication).</li> + <li>SQL-like query language with extensions for analytical tasks.</li> + <li>Supports various data formats and integrations (Kafka, S3).</li> + </ul> + <h6>Trade-offs</h6> + <ul> + <li>Not suitable for OLTP workloads (no transactions, slow point lookups/updates/deletes).</li> + <li>Eventual consistency across replicas.</li> + <li>Steeper learning curve for data modeling (denormalization is key) and cluster operations compared to traditional RDBMS.</li> + <li>Resource intensive (CPU, RAM) during complex queries.</li> + </ul> + <h6>Use Cases</h6><p>Real-time interactive dashboards, log and event analysis, telemetry and monitoring data, ad tech analytics, financial reporting, security analytics, large-scale data exploration.</p> + </div> + </div> + </div> + + <!-- DuckDB --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-modern" id="card-duckdb"> + <div class="card-body"> + <h5><i class="bi bi-quack"></i> DuckDB</h5> <!-- Changed icon to something neutral as bi-duck not standard --> + <div class="card-content-wrapper"> + <p class="summary">In-process <span class="term">analytical data management system</span> (OLAP SQLite); columnar-vectorized engine, runs anywhere.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseModern3" aria-expanded="false" aria-controls="collapseModern3">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseModern3"> + <h6>Philosophy & Core</h6> + <p>Provide the power of columnar-vectorized analytical query processing in an easy-to-deploy, embedded, serverless package (like SQLite for analytics). Runs within the host process (Python, R, Java, C++, etc.).</p> + <h6>Scalability & Consistency</h6> + <p>Primarily single-node, leveraging multi-core parallelism effectively. Data stored in a single file or can query external formats (Parquet, CSV, Arrow). ACID compliant for operations within a single process connection.</p> + <h6>Strengths</h6> + <ul> + <li>Zero-dependency installation, easy embedding.</li> + <li>Very fast analytical queries on local data or remote files (e.g., S3).</li> + <li>Excellent integration with data science tools (Pandas, Arrow).</li> + <li>Full SQL support with analytical extensions.</li> + <li>Handles larger-than-memory datasets through out-of-core processing.</li> + </ul> + <h6>Trade-offs</h6> + <ul> + <li>Primarily single-node (no built-in distributed query execution).</li> + <li>Designed for analytical queries, not high-concurrency OLTP.</li> + <li>Write performance less optimized than OLTP databases.</li> + <li>Concurrency limited by single-process access or file locking.</li> + </ul> + <h6>Use Cases</h6><p>Interactive data analysis in notebooks (Python/R), local BI tools, data transformation/ETL pipelines, replacing scripts processing flat files, teaching SQL and analytics, testing analytical queries.</p> + </div> + </div> + </div> + + <!-- pgvector / Milvus --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-modern" id="card-vector"> + <div class="card-body"> + <h5><i class="bi bi-intersect"></i> Vector DBs (e.g., Milvus, Pinecone, Weaviate, pgvector)</h5> <!-- Changed icon --> + <div class="card-content-wrapper"> + <p class="summary">Specialized for storing & querying high-dimensional <span class="term">vector embeddings</span>; powers similarity search, recommendations, RAG.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseModern4" aria-expanded="false" aria-controls="collapseModern4">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseModern4"> + <h6>Philosophy & Core</h6> + <p>Efficiently store and search dense vector representations (embeddings) of data (text, images, audio). Use Approximate Nearest Neighbor (ANN) algorithms (like HNSW, IVF_FLAT) to find vectors similar to a query vector quickly, trading perfect accuracy for speed.</p> + <h6>Scalability & Consistency</h6> + <p>Varies greatly: + <ul> + <li><span class="term">Extensions (pgvector):</span> Inherit host DB's scale/consistency (Postgres). Simpler ops, potentially limited scale/performance for huge vector workloads.</li> + <li><span class="term">Standalone DBs (Milvus, Weaviate):</span> Often distributed, horizontally scalable (sharding/replication). Typically offer tunable eventual consistency. Optimized for vector workloads.</li> + <li><span class="term">Managed Services (Pinecone):</span> Cloud-native, handle scaling transparently. Consistency models specific to provider.</li> + </ul> + </p> + <h6>Strengths</h6> + <ul> + <li>Fast similarity search over millions/billions of vectors (sub-second).</li> + <li>Support various ANN index types and distance metrics (cosine, L2).</li> + <li>Can store associated metadata alongside vectors.</li> + <li>Enables semantic search, recommendations, anomaly detection based on meaning/similarity.</li> + </ul> + <h6>Trade-offs</h6> + <ul> + <li>ANN search is approximate (may miss some true neighbors). Tuning recall vs. latency/cost is critical.</li> + <li>Indexing can be computationally expensive and RAM-heavy.</li> + <li>Operational complexity for self-managed standalone vector DBs.</li> + <li>Rapidly evolving field with changing APIs and best practices.</li> + <li>Requires a separate process to generate embeddings (usually ML models).</li> + </ul> + <h6>Use Cases</h6><p>Retrieval-Augmented Generation (RAG) for LLMs, semantic search, image/audio similarity search, recommendation systems, duplicate detection, anomaly detection, clustering.</p> + </div> + </div> + </div> + + </div><!-- /.row --> + </div><!-- /.schema-container --> + <!-- END NEW SECTION: MODERN ENGINES --> + + <!-- NEW SECTION: INDEX TYPES --> + <div class="schema-container cat-index" data-section-id="section-index"> + <h2 class="section-title" id="section-index">Common Index Types & Guidance</h2> + <div class="row"> + + <!-- B-Tree / Clustered --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-index" id="card-btree"> + <div class="card-body"> + <h5><i class="bi bi-list-ol"></i> B-Tree / B+Tree</h5> + <div class="card-content-wrapper"> + <p class="summary">Default index for most databases. Excellent for <span class="term">equality (=)</span> and <span class="term">range queries</span> (<, >, BETWEEN) on ordered data types.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseIdx1" aria-expanded="false" aria-controls="collapseIdx1">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseIdx1"> + <h6>Mechanism</h6> + <p>A self-balancing tree structure that keeps data sorted and allows searches, sequential access, insertions, and deletions in logarithmic time O(log n).</p> + <h6>Variations</h6> + <ul> + <li><strong>Clustered Index:</strong> Determines the physical storage order of rows in the table (e.g., SQL Server default PK, MySQL InnoDB PK). Only one per table.</li> + <li><strong>Non-Clustered Index (Heap):</strong> Contains index keys and pointers (row locators) to the actual data rows, which are stored separately (often in a heap). Multiple allowed per table.</li> + </ul> + <h6>Strengths</h6> + <ul><li>Efficient for exact matches, range scans, sorting (ORDER BY).</li></ul> + <h6>Weaknesses</h6> + <ul><li>Less effective for pattern matching like `LIKE '%substring%'`.</li><li>Not ideal for multi-dimensional data or unordered set lookups.</li></ul> + <h6>Use Cases</h6><p>Primary keys, foreign keys, columns frequently used in WHERE clauses with equality/range conditions, ORDER BY columns.</p> + </div> + </div> + </div> + + <!-- GiST / GIN --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-index" id="card-gistgin"> + <div class="card-body"> + <h5><i class="bi bi-bounding-box"></i> GiST / GIN (Postgres)</h5> <!-- Changed icon --> + <div class="card-content-wrapper"> + <p class="summary">Postgres generalized index structures enabling indexing of <span class="term">complex/composite types</span> (spatial, text, arrays, JSONB).</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseIdx2" aria-expanded="false" aria-controls="collapseIdx2">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseIdx2"> + <h6>GiST (Generalized Search Tree)</h6> + <ul> + <li>Lossy index structure; versatile framework for various data types (R-Tree for spatial, etc.).</li> + <li>Good balance between build time, size, and query speed.</li> + <li>Can support "nearest neighbor" searches.</li> + <li>Use cases: PostGIS geometry/geography, range types, some full-text search.</li> + </ul> + <h6>GIN (Generalized Inverted Index)</h6> + <ul> + <li>Optimized for indexing composite values containing multiple components (elements of an array, key-value pairs in JSONB, words in text).</li> + <li>Typically faster for lookups than GiST when searching for specific elements within composite types.</li> + <li>Can be slower to build/update and larger than GiST.</li> + <li>Use cases: `jsonb` field lookups (`@>`, `?`), array containment (`<@`, `@>`), full-text search (tsvector).</li> + </ul> + <h6>Note</h6><p>Choice depends on data type, query patterns, and tolerance for build time vs. query time.</p> + </div> + </div> + </div> + + <!-- LSM / SSTable --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-index" id="card-lsm"> + <div class="card-body"> + <h5><i class="bi bi-stack"></i> LSM Tree</h5> + <div class="card-content-wrapper"> + <p class="summary">Log-Structured Merge-Tree. Optimized for <span class="term">high write throughput</span> by batching writes in memory (<span class="term">memtable</span>) and flushing sequentially to immutable disk files (<span class="term">SSTables</span>).</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseIdx3" aria-expanded="false" aria-controls="collapseIdx3">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseIdx3"> + <h6>Mechanism</h6> + <p>Writes append to an in-memory memtable and a commit log. When memtable is full, it's flushed to disk as a sorted SSTable. Reads must check memtable and potentially multiple SSTables. Background compaction merges SSTables to remove deleted/updated data and improve read efficiency.</p> + <h6>Strengths</h6> + <ul> + <li>Excellent sequential write performance (avoids random I/O).</li> + <li>Good compression potential within SSTables.</li> + </ul> + <h6>Weaknesses</h6> + <ul> + <li>Read performance can suffer (read amplification) as data for a key might exist in multiple SSTables (mitigated by Bloom filters).</li> + <li>Space amplification due to retaining old data until compaction.</li> + <li>Compaction process consumes background I/O and CPU resources.</li> + </ul> + <h6>Use Cases</h6><p>Databases prioritizing write speed and scalability: Cassandra, HBase, RocksDB (used by MyRocks, TiKV), LevelDB, ClickHouse (MergeTree engine family).</p> + </div> + </div> + </div> + + <!-- BRIN + Zonemap --> + <div class="col-lg-3 col-md-6"> + <div class="info-card db-type-index" id="card-brin"> + <div class="card-body"> + <h5><i class="bi bi-distribute-vertical"></i> BRIN / Zonemap</h5> <!-- Changed Icon --> + <div class="card-content-wrapper"> + <p class="summary">Block Range Index (Postgres) / Zonemaps (Other systems). Stores <span class="term">summary info</span> (min/max) for large blocks of table rows. Very small, effective for <span class="term">naturally ordered data</span>.</p> + <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseIdx4" aria-expanded="false" aria-controls="collapseIdx4">Details <i class="bi bi-chevron-down"></i></button> + </div> + </div> + <div class="collapse collapse-content" id="collapseIdx4"> + <h6>Mechanism</h6> + <p>Divides the table into large ranges of physical blocks. For each range, stores the minimum and maximum value of the indexed column(s) found within those blocks. Queries can quickly skip ranges that cannot contain the target value(s).</p> + <h6>Strengths</h6> + <ul> + <li>Extremely small index size (orders of magnitude smaller than B-Tree).</li> + <li>Very fast to build and maintain.</li> + <li>Effective at pruning large portions of the table for queries on correlated columns (e.g., timestamp in append-only log tables).</li> + </ul> + <h6>Weaknesses</h6> + <ul> + <li>Only effective if the indexed column's values have a strong correlation with their physical storage location (e.g., append-only data like timestamps, monotonically increasing IDs).</li> + <li>Less effective than B-Tree for selective lookups of specific values if data is not physically ordered.</li> + </ul> + <h6>Use Cases</h6><p>Large log tables, time-series data partitioned by time, fact tables ordered by date/ID, any large table where data is naturally clustered physically based on the indexed column.</p> + </div> + </div> + </div> + + </div><!-- /.row --> + </div><!-- /.schema-container --> + <!-- END NEW SECTION: INDEX TYPES --> + + <!-- 4. KEY DESIGN CONSIDERATIONS (TRADEOFFS) --> <div class="schema-container cat-tradeoff" data-section-id="section-tradeoffs"> - <h2 class="section-title" id="section-tradeoffs">Key Design Considerations</h2> + <h2 class="section-title" id="section-tradeoffs">Key Design Considerations & Tradeoffs</h2> <div class="row"> <div class="col-lg-4 col-md-6"> <div class="info-card db-type-tradeoff" id="card-tradeoff-cap"> <div class="card-body"> - <h5><i class="bi bi-shield-exclamation"></i> Consistency vs. Availability</h5> + <h5><i class="bi bi-shield-exclamation"></i> Consistency vs. Availability (CAP)</h5> <div class="card-content-wrapper"> - <p class="summary">During network partitions, distributed systems must often choose between guaranteeing <span class="term">up-to-date data (C)</span> or <span class="term">always responding (A)</span>. Critical tradeoff (CAP).</p> + <p class="summary">During network partitions, distributed systems must often choose between guaranteeing <span class="term">up-to-date data (C)</span> or <span class="term">always responding (A)</span>, assuming Partition Tolerance (P) is required.</p> <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTradeoff1" aria-expanded="false" aria-controls="collapseTradeoff1"> Explore <i class="bi bi-chevron-down"></i> </button> @@ -780,8 +1100,8 @@ <div class="collapse collapse-content" id="collapseTradeoff1"> <p>As described by the CAP Theorem, when a network partition prevents nodes in a distributed database from communicating reliably, a choice must be made:</p> <ul> - <li><strong>Prioritize Consistency (CP):</strong> The system refuses to respond (becomes unavailable) if it cannot guarantee that the data read or written is consistent across the partition. This prevents stale reads or conflicting writes but impacts uptime during network issues. <br/><em>Example Use Case:</em> Financial transactions where data accuracy is non-negotiable.</li> - <li><strong>Prioritize Availability (AP):</strong> The system continues to respond to requests even if it cannot guarantee consistency across the partition. Nodes might serve potentially stale data or accept writes that need later reconciliation. This maximizes uptime but requires applications to handle potential inconsistencies.<br/><em>Example Use Case:</em> Social media feeds or shopping carts where showing slightly old data is often acceptable to keep the service responsive.</li> + <li><strong>Prioritize Consistency (CP):</strong> The system refuses to respond (becomes unavailable) if it cannot guarantee that the data read or written is consistent across the partition. This prevents stale reads or conflicting writes but impacts uptime during network issues. <br/><em>Example Systems/Configs:</em> Strongly consistent RDBMS clusters, HBase, some MongoDB/Redis Cluster configurations. <br/><em>Example Use Case:</em> Financial transactions where data accuracy is non-negotiable.</li> + <li><strong>Prioritize Availability (AP):</strong> The system continues to respond to requests even if it cannot guarantee consistency across the partition. Nodes might serve potentially stale data or accept writes that need later reconciliation. This maximizes uptime but requires applications to handle potential inconsistencies.<br/><em>Example Systems/Configs:</em> Cassandra, DynamoDB, Riak, most default NoSQL configurations, eventual consistency modes. <br/><em>Example Use Case:</em> Social media feeds, shopping carts, product views where showing slightly old data is often acceptable to keep the service responsive.</li> </ul> <p>Understanding the application's tolerance for inconsistency vs. downtime is crucial when selecting a distributed database and configuring its consistency levels.</p> </div> @@ -790,31 +1110,31 @@ <div class="col-lg-4 col-md-6"> <div class="info-card db-type-tradeoff" id="card-tradeoff-pacelc"> <div class="card-body"> - <h5><i class="bi bi-hourglass-split"></i> Latency vs. Consistency</h5> + <h5><i class="bi bi-hourglass-split"></i> Latency vs. Consistency (PACELC)</h5> <div class="card-content-wrapper"> - <p class="summary">Even without partitions, systems often trade <span class="term">lower latency (L)</span> for <span class="term">weaker consistency (C)</span>, or vice versa (PACELC).</p> + <p class="summary">Even without partitions (Else case), systems often trade <span class="term">lower latency (L)</span> for <span class="term">weaker consistency (C)</span>, or vice versa. Extends CAP.</p> <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTradeoff2" aria-expanded="false" aria-controls="collapseTradeoff2"> Explore <i class="bi bi-chevron-down"></i> </button> </div> </div> <div class="collapse collapse-content" id="collapseTradeoff2"> - <p>The PACELC theorem extends CAP by observing that even in the absence of Partitions (P), distributed systems face another fundamental tradeoff: choosing between lower <span class="term">Latency (L)</span> and higher <span class="term">Consistency (C)</span> during normal operation.</p> + <p>The PACELC theorem extends CAP by observing that distributed systems face another fundamental tradeoff *during normal operation* (Else case, when there's no Partition): choosing between lower <span class="term">Latency (L)</span> and higher <span class="term">Consistency (C)</span>.</p> <p>The theorem states: If there is a Partition (P), how does the system trade off Availability (A) versus Consistency (C)? Else (E), when operating normally, how does the system trade off Latency (L) versus Consistency (C)?</p> <ul> - <li><strong>Prioritize Consistency (PC/EC):</strong> Achieving higher consistency (e.g., ensuring a write is replicated to multiple nodes before acknowledging success) often requires more network communication and coordination, increasing the time (latency) it takes to complete an operation.</li> - <li><strong>Prioritize Latency (PA/EL):</strong> Responding faster might mean acknowledging a write after it's only been accepted by one or a few nodes, relying on asynchronous replication to propagate it later. This reduces latency but introduces a window where reads might see stale data (lower consistency).</li> + <li><strong>Prioritize Consistency (PC/EC - High C, Higher L):</strong> Achieving higher consistency (e.g., synchronous replication to a quorum of nodes before acknowledging a write) often requires more network round trips and coordination, increasing the time (latency) it takes to complete an operation.<br/><em>Example Systems:</em> Paxos/Raft based systems like etcd, ZooKeeper, strongly consistent RDBMS.</li> + <li><strong>Prioritize Latency (PA/EL - Low L, Lower C):</strong> Responding faster might mean acknowledging a write after it's only been accepted locally or by one replica, relying on asynchronous replication to propagate it later. This reduces latency but introduces a window where reads might see stale data (lower consistency). <br/><em>Example Systems:</em> Cassandra (tunable), Redis (async replication), many default NoSQL settings.</li> </ul> - <p>Different databases optimize for different points on this spectrum. For example, systems designed for strong consistency might inherently have higher latency than those prioritizing speed with eventual consistency.</p> + <p>Different databases optimize for different points on this spectrum (e.g., DynamoDB often described as PA/EL, BigTable/HBase as PC/EC, Cassandra as PA/EL but tunable towards EC). Understanding this helps select systems matching application needs for speed vs. data freshness.</p> </div> </div> </div> <div class="col-lg-4 col-md-6"> <div class="info-card db-type-tradeoff" id="card-tradeoff-scaling"> <div class="card-body"> - <h5><i class="bi bi-arrows-angle-expand"></i> Scale Strategy</h5> + <h5><i class="bi bi-arrows-angle-expand"></i> Scale Strategy: Vertical vs. Horizontal</h5> <div class="card-content-wrapper"> - <p class="summary"><span class="term">Vertical scaling</span> is simpler initially but limited. <span class="term">Horizontal scaling</span> offers greater potential but adds complexity. Choice depends on growth needs.</p> + <p class="summary"><span class="term">Vertical scaling</span> (Scale-Up) is simpler initially but hits limits. <span class="term">Horizontal scaling</span> (Scale-Out) offers greater potential but adds operational/architectural complexity.</p> <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTradeoff3" aria-expanded="false" aria-controls="collapseTradeoff3"> Explore <i class="bi bi-chevron-down"></i> </button> @@ -825,29 +1145,29 @@ <ul> <li><strong>Vertical Scaling (Scale-Up):</strong> <ul> - <li><em>Approach:</em> Add more power (CPU, RAM, faster disks) to the existing server.</li> + <li><em>Approach:</em> Add more power (CPU, RAM, faster disks/NVMe) to the existing server.</li> <li><em>Best For:</em> Applications with moderate growth expectations, systems where operational simplicity is paramount initially, traditional monolithic applications. Often the default starting point for SQL databases.</li> - <li><em>Limits:</em> Physical hardware constraints, cost increases non-linearly at the high end, potential single point of failure.</li> + <li><em>Limits:</em> Physical hardware constraints (max CPU/RAM per socket/board), cost increases non-linearly at the high end, potential single point of failure, downtime often needed for upgrades.</li> </ul> </li> <li><strong>Horizontal Scaling (Scale-Out):</strong> <ul> - <li><em>Approach:</em> Add more servers (nodes) and distribute the load/data (replication, sharding).</li> - <li><em>Best For:</em> Applications expecting massive growth, systems requiring high availability and fault tolerance, cloud-native applications. The native approach for most NoSQL databases.</li> - <li><em>Limits:</em> Increased architectural and operational complexity (managing clusters, network latency, consistency issues), potential need for application changes to leverage distribution effectively.</li> + <li><em>Approach:</em> Add more commodity servers (nodes) and distribute the load/data (e.g., replication, sharding/partitioning, clustering).</li> + <li><em>Best For:</em> Applications expecting massive growth or spiky loads, systems requiring high availability and fault tolerance, cloud-native architectures. The native approach for most NoSQL databases and modern distributed SQL.</li> + <li><em>Limits:</em> Increased architectural complexity (sharding logic, distributed transactions), operational overhead (managing clusters, network latency, consistency across nodes), potential "hot spots" if sharding key isn't chosen well, inter-node communication overhead.</li> </ul> </li> </ul> - <p>Many modern systems use a combination, starting vertical and implementing horizontal strategies (like read replicas) as needed, or using databases specifically designed for horizontal scaling from the start.</p> + <p>Many modern systems use a combination, starting vertical and implementing horizontal strategies (like read replicas) as needed, or using databases specifically designed for horizontal scaling from the start (e.g., Cassandra, Vitess, CockroachDB).</p> </div> </div> </div> <div class="col-lg-4 col-md-6"> <div class="info-card db-type-tradeoff" id="card-tradeoff-schema"> <div class="card-body"> - <h5><i class="bi bi-pencil-square"></i> Schema Flexibility</h5> + <h5><i class="bi bi-pencil-square"></i> Schema Flexibility: Rigid vs. Flexible</h5> <div class="card-content-wrapper"> - <p class="summary"><span class="term">Rigid schemas</span> (SQL) enforce data quality but require migrations. <span class="term">Flexible schemas</span> (NoSQL) speed development but can complicate querying.</p> + <p class="summary"><span class="term">Schema-on-Write</span> (SQL) enforces data quality/predictability but requires migrations. <span class="term">Schema-on-Read</span> (NoSQL) speeds iteration but shifts validation burden to application/query time.</p> <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTradeoff4" aria-expanded="false" aria-controls="collapseTradeoff4"> Explore <i class="bi bi-chevron-down"></i> </button> @@ -858,53 +1178,55 @@ <ul> <li><strong>Rigid Schema (Schema-on-Write):</strong> <ul> - <li><em>Benefit:</em> Ensures data consistency, integrity, and predictability. Clearly defines the expected data structure. Optimizes storage and indexing.</li> - <li><em>Drawback:</em> Requires upfront design effort. Adapting to changes requires formal schema migrations, which can be complex, slow, and potentially require downtime. Less suitable for unstructured or rapidly evolving data.</li> + <li><em>Benefit:</em> Ensures data consistency, integrity, and predictability. Clearly defines the expected data structure. Optimizes storage and indexing based on known types. Self-documenting structure.</li> + <li><em>Drawback:</em> Requires upfront design effort. Adapting to changes requires formal schema migrations (e.g., `ALTER TABLE`), which can be complex, slow, lock-intensive, and potentially require downtime for large tables. Less suitable for unstructured or rapidly evolving data.</li> <li><em>Typical Databases:</em> SQL databases (Postgres, MySQL, SQL Server, SQLite).</li> </ul> </li> <li><strong>Flexible Schema (Schema-on-Read / Schema-less):</strong> <ul> - <li><em>Benefit:</em> Allows for rapid development and easy adaptation to changing requirements. Can handle diverse and evolving data structures easily. No need for complex migrations for simple field additions.</li> - <li><em>Drawback:</em> Can lead to inconsistent or messy data if not managed carefully at the application layer. Querying data with heterogeneous structures can be complex and less performant. Data meaning might depend on application logic.</li> - <li><em>Typical Databases:</em> Document DBs (MongoDB), Key-Value Stores (Redis), Wide-Column Stores (Cassandra). (Note: Even flexible systems often benefit from some level of schema definition or validation).</li> + <li><em>Benefit:</em> Allows for rapid development and easy adaptation to changing requirements or diverse data sources. Can handle heterogeneous structures easily (e.g., different attributes per document). No blocking migrations for simple field additions.</li> + <li><em>Drawback:</em> Can lead to inconsistent or "dirty" data if not managed carefully via application logic or optional validation rules. Querying data with varying structures can be complex or inefficient. Data meaning might be ambiguous without application context or documentation. Potential for increased storage size if field names are repeated.</li> + <li><em>Typical Databases:</em> Document DBs (MongoDB), Key-Value Stores (Redis), Wide-Column Stores (Cassandra). (Note: Even flexible systems often benefit from some level of schema definition/validation for critical fields).</li> </ul> </li> </ul> + <p>The choice depends on the volatility of the data structure, the need for strict integrity enforcement vs. development speed, and the team's discipline in managing flexible schemas.</p> </div> </div> </div> <div class="col-lg-4 col-md-6"> <div class="info-card db-type-tradeoff" id="card-tradeoff-query"> <div class="card-body"> - <h5><i class="bi bi-question-diamond"></i> Query Complexity</h5> + <h5><i class="bi bi-question-diamond"></i> Query Complexity & Patterns</h5> <div class="card-content-wrapper"> - <p class="summary"><span class="term">SQL</span> excels at complex JOINs and ad-hoc queries. <span class="term">NoSQL</span> often optimizes for specific, simpler query patterns for high performance.</p> + <p class="summary"><span class="term">SQL</span> excels at complex JOINs, aggregations, and ad-hoc analysis across normalized data. <span class="term">NoSQL</span> often optimizes for simpler, high-velocity query patterns specific to its data model (key lookups, document retrieval, graph traversals).</p> <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTradeoff5" aria-expanded="false" aria-controls="collapseTradeoff5"> Explore <i class="bi bi-chevron-down"></i> </button> </div> </div> <div class="collapse collapse-content" id="collapseTradeoff5"> - <p>The database's design influences the types of queries it handles efficiently.</p> + <p>The database's design profoundly influences the types of queries it handles efficiently.</p> <ul> <li><strong>SQL Databases:</strong> <ul> - <li><em>Strength:</em> Designed for complex, ad-hoc queries involving multiple tables (JOINs), aggregations, subqueries, and filtering on various columns. The relational model and SQL language provide a powerful, standardized way to access related data.</li> - <li><em>Consideration:</em> Performance of very complex JOINs across huge tables can still be a challenge, requiring careful indexing and query optimization.</li> + <li><em>Strength:</em> Designed for complex, ad-hoc queries involving multiple related tables (JOINs), sophisticated aggregations (GROUP BY, window functions), subqueries, and filtering on various columns. The relational model and declarative SQL language provide a powerful, standardized way to interrogate structured data. Mature query optimizers.</li> + <li><em>Consideration:</em> Performance of very complex JOINs or full table scans on huge tables requires careful indexing, query tuning, and potentially denormalization. Can struggle with graph-like traversals (recursive CTEs can be complex/slow).</li> </ul> </li> <li><strong>NoSQL Databases:</strong> <ul> - <li><em>Strength:</em> Often optimized for very high performance on specific query patterns based on their data model: + <li><em>Strength:</em> Often optimized for very high performance and scalability on specific, often simpler, query patterns based on their data model: <ul> - <li><em>Key-Value:</em> Extremely fast lookups by key.</li> - <li><em>Document:</em> Fast retrieval/update of entire documents or querying fields within a document.</li> - <li><em>Wide-Column:</em> Efficient retrieval of specific columns for a range of rows.</li> - <li><em>Graph:</em> Fast traversal of relationships between nodes.</li> + <li><em>Key-Value (Redis):</em> Blazing fast O(1) lookups by primary key.</li> + <li><em>Document (MongoDB):</em> Fast retrieval/update of entire documents; efficient querying on indexed fields within documents, including nested structures/arrays.</li> + <li><em>Wide-Column (Cassandra):</em> Efficient retrieval of specific columns for a range of rows based on partition/clustering keys. Optimized for writes.</li> + <li><em>Graph (Neo4j):</em> Extremely fast traversal of relationships (finding paths, neighbors) starting from known nodes.</li> + <li><em>Search (Elasticsearch):</em> Powerful full-text search, relevance scoring, complex filtering, and aggregations across documents.</li> </ul> </li> - <li><em>Consideration:</em> Queries that don't fit the optimized pattern (e.g., JOIN-like operations across different document types in a Document DB, complex filtering on non-indexed fields, graph-wide aggregations) might be inefficient, require multiple application-level queries, or may not be possible directly. Data modeling often needs to be done with specific query patterns in mind ("query-first design").</li> + <li><em>Consideration:</em> Queries that don't fit the optimized access pattern (e.g., JOIN-like operations across collections/buckets, filtering on non-indexed fields, graph-wide scans, full-text search in non-search DBs) might be inefficient, require multiple application-level lookups, necessitate data duplication/denormalization, or be impossible directly. Data modeling often requires a "query-first" design approach.</li> </ul> </li> </ul> @@ -916,23 +1238,24 @@ <div class="card-body"> <h5><i class="bi bi-puzzle"></i> Data Model Fit</h5> <div class="card-content-wrapper"> - <p class="summary">Choose a database whose <span class="term">native data model</span> (tables, documents, key-value, graph) best represents your application's primary data structures.</p> + <p class="summary">Choose a database whose <span class="term">native data model</span> (tables, documents, key-value, graph, column-family, vectors) best represents and serves the primary entities and access patterns of your application.</p> <button class="btn btn-sm details-toggle" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTradeoff6" aria-expanded="false" aria-controls="collapseTradeoff6"> Explore <i class="bi bi-chevron-down"></i> </button> </div> </div> <div class="collapse collapse-content" id="collapseTradeoff6"> - <p>Aligning your application's data structure with the database's native model significantly impacts development ease and performance.</p> + <p>Aligning your application's conceptual data structure and primary query needs with the database's native model significantly impacts development ease, performance, and scalability.</p> <ul> - <li><strong>Relational Model (SQL - Tables):</strong> Best fit for highly structured data with well-defined relationships that need to be enforced, requiring complex queries across different entities. Examples: Financial records, inventory systems, user accounts with complex permissions.</li> - <li><strong>Document Model (MongoDB):</strong> Ideal for semi-structured data where each entity might have varying attributes, or data naturally nests (like blog posts with comments). Good when data maps closely to application objects. Examples: Content management, product catalogs, user profiles.</li> - <li><strong>Key-Value Model (Redis):</strong> Suitable when the primary access pattern is retrieving a value based on a known key, requiring very low latency. Examples: Caching, session state, user preferences lookups.</li> - <li><strong>Wide-Column Model (Cassandra):</strong> Good for handling massive datasets where access is typically by row key, but the columns needed per row might vary or be very numerous (sparse data). Examples: Time-series data, IoT sensor readings, activity logs.</li> - <li><strong>Graph Model (Neo4j):</strong> The best choice when the relationships *between* data points are as important as the data points themselves, and queries involve traversing these connections. Examples: Social networks, recommendation systems, fraud detection rings.</li> - <li><strong>Search Model (Elasticsearch):</strong> Optimal for text-heavy data requiring sophisticated search capabilities (relevance, faceting, geospatial) or complex aggregations for analytics. Examples: Log analysis, product search, document repositories.</li> + <li><strong>Relational Model (SQL - Tables):</strong> Best fit for highly structured data with well-defined relationships that need enforcement (foreign keys) and integrity (constraints), requiring complex queries/transactions across different entities. <br/><em>Examples: Financial records, ERP systems, user accounts with detailed permissions, normalized inventory.</em></li> + <li><strong>Document Model (MongoDB):</strong> Ideal for semi-structured data where entities (documents) can have varying attributes, or data naturally nests (e.g., orders with line items). Good when data maps closely to application objects. <br/><em>Examples: Content management, product catalogs, user profiles, session data, event logs.</em></li> + <li><strong>Key-Value Model (Redis, Memcached):</strong> Suitable when the primary access pattern is retrieving a value based on a known unique key, requiring extremely low latency. <br/><em>Examples: Caching layers, session state, real-time counters, feature flags.</em></li> + <li><strong>Wide-Column Model (Cassandra, HBase):</strong> Good for handling massive datasets where access is typically by row key (partition key), but the columns needed per row might vary or be very numerous (sparse). Optimized for writes and range scans within partitions. <br/><em>Examples: Time-series data, IoT sensor readings, activity logs, user event tracking.</em></li> + <li><strong>Graph Model (Neo4j, Neptune):</strong> The best choice when the *relationships* between data points are as important as the data itself, and queries involve traversing these connections (e.g., path finding, centrality, pattern matching). <br/><em>Examples: Social networks, fraud detection rings, recommendation engines, knowledge graphs, dependency analysis.</em></li> + <li><strong>Search Model (Elasticsearch, OpenSearch, Solr):</strong> Optimal for text-heavy data requiring sophisticated search capabilities (relevance ranking, faceting, fuzzy matching, geospatial) or complex aggregations for near real-time analytics. <br/><em>Examples: Log analysis, product search, document repositories, SIEM data, application monitoring.</em></li> + <li><strong>Vector Model (Milvus, Pinecone, etc.):</strong> Specifically designed for storing and querying high-dimensional vector embeddings based on similarity.<br/><em>Examples: Semantic search, RAG for LLMs, image/audio retrieval by content.</em></li> </ul> - <p>Trying to force data into an unsuitable model often leads to complexity and poor performance (e.g., simulating complex relationships in a Key-Value store, or storing large binary blobs in a relational database optimized for structured data).</p> + <p>Trying to force data into an unsuitable model (e.g., complex relations in K/V, full-text search in basic SQL, tabular data in graph) often leads to convoluted application logic, poor performance, and scaling difficulties.</p> </div> </div> </div> @@ -946,36 +1269,37 @@ <div class="col-12"> <div class="info-card db-type-choosing" id="card-polyglot"> <div class="card-body"> - <h5 class="text-center"><i class="bi bi-check2-circle"></i> Selection & Polyglot Persistence</h5> + <h5 class="text-center"><i class="bi bi-check2-circle"></i> Selection Criteria & Polyglot Persistence</h5> <div class="card-content-wrapper"> - <p class="summary text-center">Consider consistency, structure, queries, scale, availability, schema needs, and operations. Modern systems often use <span class="term">multiple database types</span> (Polyglot Persistence) for different tasks.</p> + <p class="summary text-center">Evaluate based on: Consistency needs (ACID/BASE), Data Structure & Model Fit, Query Patterns & Complexity, Scalability requirements (Read/Write, Horizontal/Vertical), Availability & Fault Tolerance (CAP/PACELC), Schema Volatility, Operational Burden (Self-managed vs Cloud), and Team Expertise. Modern systems often adopt <span class="term">Polyglot Persistence</span>, using multiple specialized databases.</p> <button class="btn btn-sm details-toggle mx-auto" type="button" data-bs-toggle="collapse" data-bs-target="#collapseChoosing" aria-expanded="false" aria-controls="collapseChoosing"> Key Questions & Strategy <i class="bi bi-chevron-down"></i> </button> </div> </div> <div class="collapse collapse-content" id="collapseChoosing"> - <h6>Key Questions to Ask:</h6> + <h6>Key Decision Factors (Checklist for Architects):</h6> <ul> - <li><strong>Consistency Needs:</strong> Is immediate, strong consistency (ACID) required for all operations, or is eventual consistency (BASE) acceptable for some/all parts of the application?</li> - <li><strong>Data Structure:</strong> Is the data highly structured and relational? Semi-structured (like JSON documents)? Simple key-value pairs? Highly interconnected (graph)? Primarily text for searching?</li> - <li><strong>Query Patterns:</strong> What are the most frequent and critical queries? Do they involve complex joins across multiple tables? Simple key lookups? Full-text search? Traversing relationships? Aggregations?</li> - <li><strong>Scalability Requirements:</strong> What is the expected data volume and request load (reads vs. writes)? How critical is seamless horizontal scaling?</li> - <li><strong>Availability Needs:</strong> What level of uptime is required? Can the system tolerate brief unavailability during network issues to ensure consistency (CP), or must it remain available even if data might be temporarily stale (AP)?</li> - <li><strong>Schema Evolution:</strong> How often is the data structure expected to change? Is schema flexibility a major advantage, or is enforcing structure more important?</li> - <li><strong>Operational Capacity:</strong> What is the team's expertise? What is the budget for licensing, hardware/cloud resources, and operational overhead (management, monitoring, backups)? Are managed services an option?</li> - <li><strong>Ecosystem & Tooling:</strong> How well does the database integrate with existing languages, frameworks, and infrastructure? How strong is the community support?</li> + <li><strong>Consistency Requirements:</strong> Is strong, immediate consistency (ACID) mandatory for critical operations, or is eventual consistency (BASE) acceptable for performance/availability gains? What isolation level is needed?</li> + <li><strong>Data Model Fit:</strong> Does the data naturally map to relations (tables), documents, key-value pairs, graph structures, time-series, text indices, or vector embeddings? (See "Data Model Fit" tradeoff).</li> + <li><strong>Query Patterns:</strong> What are the dominant read/write patterns? Simple lookups? Complex analytical queries? Range scans? Full-text search? Graph traversals? Similarity searches? What are the latency requirements for these queries?</li> + <li><strong>Scalability Needs:</strong> What is the current and projected data volume (GB/TB/PB)? What is the expected request throughput (RPS/QPS) for reads and writes? Is seamless horizontal scaling a primary requirement, or is vertical scaling sufficient initially?</li> + <li><strong>Availability & Fault Tolerance:</strong> What RPO (Recovery Point Objective - max data loss) and RTO (Recovery Time Objective - max downtime) are acceptable? Does the system need multi-region or multi-datacenter redundancy? How does the system need to behave during network partitions (CP vs AP)?</li> + <li><strong>Schema Evolution:</strong> How stable is the data schema? Is rapid iteration with frequent schema changes expected, favoring flexibility (Schema-on-Read), or is stability and enforced structure more important (Schema-on-Write)?</li> + <li><strong>Operational Considerations (Self-Managed Bias):</strong> What is the team's operational expertise with the candidate database(s)? What are the requirements for backups, monitoring, upgrades, security patching, and cluster management? What are the infrastructure costs (hardware, network)? Compare this to the cost/constraints of managed services if applicable.</li> + <li><strong>Ecosystem & Team Skills:</strong> How well does the database integrate with the existing tech stack (languages, frameworks)? Is there good community support, documentation, and available talent?</li> </ul> <h6>Polyglot Persistence Strategy</h6> - <p>Recognize that <span class="term">no single database is best for every task</span>. Modern complex applications often benefit from using multiple database technologies concurrently, choosing the best tool for each specific job within the application architecture.</p> - <p><em>Example Scenario:</em> An e-commerce site might use:</p> + <p>Recognize that <span class="term">no single database excels at everything</span>. Complex applications often benefit from using multiple database technologies concurrently, selecting the best tool for each specific job based on the criteria above.</p> + <p><em>Example Scenario (E-commerce revisited):</em></p> <ul> - <li>A <span class="term">Relational Database (e.g., PostgreSQL)</span> for core transactional data like orders, user accounts, and billing (requiring ACID).</li> - <li>A <span class="term">Search Engine (e.g., Elasticsearch)</span> for product catalog search and filtering.</li> - <li>A <span class="term">Key-Value Store (e.g., Redis)</span> for caching user sessions and frequently accessed data.</li> - <li>Perhaps a <span class="term">Graph Database (e.g., Neo4j)</span> for generating product recommendations ("users who bought this also bought...").</li> + <li><span class="term">PostgreSQL (SQL):</span> Core user accounts, orders, billing, inventory (ACID transactions needed).</li> + <li><span class="term">Elasticsearch (Search):</span> Product catalog search, filtering, faceting, log analysis for operational insights.</li> + <li><span class="term">Redis (Key-Value):</span> User session caching, shopping cart persistence (fast access, tolerance for minor loss ok), rate limiting.</li> + <li><span class="term">ClickHouse (Modern/OLAP):</span> Analyzing sales trends, user behavior for BI dashboards (fast analytics on large volumes).</li> + <li><span class="term">Vector DB (e.g., Milvus):</span> Powering "related products" recommendations based on product embedding similarity.</li> </ul> - <p>This approach leverages the strengths of each database type but introduces complexity in managing multiple data stores and ensuring consistency between them where necessary.</p> + <p><strong>Challenge:</strong> This approach maximizes efficiency per task but introduces significant architectural and operational complexity: managing multiple systems, ensuring data consistency/synchronization between them where necessary (e.g., via event streams, ETL), and requiring broader team expertise.</p> </div> </div> </div> @@ -985,16 +1309,15 @@ </div> <!-- /container --> <footer class="container text-center"> - <p>© 2025 David Veksler</p> + <p>© 2025 David Veksler <!-- Or current year --></p> </footer> <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script> <script> -// Final Script incorporating fixes +// Final Script incorporating fixes and robust handling document.addEventListener('DOMContentLoaded', () => { const mainContainer = document.getElementById('main-container'); - const cards = document.querySelectorAll('.info-card'); // Select all cards again let currentHoverState = { card: null, line: null }; // Track active state // --- Debounce function --- @@ -1013,144 +1336,194 @@ document.addEventListener('DOMContentLoaded', () => { // --- Get Computed Color --- function getElementColor(element) { if (!element) return '#6b7280'; // Default concept color - return window.getComputedStyle(element).getPropertyValue('--db-category-color').trim() || '#6b7280'; + // Check the element itself, then its closest card, then its section for the color var + const card = element.closest('.info-card') || element; + const section = element.closest('.schema-container'); + let color = window.getComputedStyle(card).getPropertyValue('--db-category-color').trim(); + if (!color || color === 'initial' || color === 'inherit') { + color = section ? window.getComputedStyle(section).getPropertyValue('--db-category-color').trim() : '#6b7280'; + } + return color || '#6b7280'; } // --- Clear Hover State --- function clearHoverState(forceClear = false) { + // Check if the mouse is still physically over the currently highlighted card + // This prevents flickering when moving slightly within the card boundaries const isMouseStillOverCard = currentHoverState.card && currentHoverState.card.matches(':hover'); - // Clear only if forced, or if mouse left the currently hovered card + // Clear only if forced, or if the mouse truly left the currently hovered card if (forceClear || !isMouseStillOverCard) { if (currentHoverState.line) { - try { currentHoverState.line.remove(); } catch (e) { /* Ignore */ } + try { currentHoverState.line.remove(); } catch (e) { console.warn("Error removing line:", e); } + currentHoverState.line = null; } mainContainer.classList.remove('is-dimmed'); - // Remove highlight from potentially previously highlighted card - if(currentHoverState.card) { + + // Remove highlight from the previously highlighted card + if (currentHoverState.card) { currentHoverState.card.classList.remove('is-highlighted'); const oldSchemaContainer = currentHoverState.card.closest('.schema-container'); if (oldSchemaContainer) { oldSchemaContainer.classList.remove('is-highlighted-section'); - const oldTitle = oldSchemaContainer.querySelector('.section-title'); - if (oldTitle) oldTitle.style.opacity = ''; // Reset opacity + // No need to reset title opacity explicitly, handled by general dimming rules } + currentHoverState.card = null; } - currentHoverState = { card: null, line: null }; } } // --- Apply Hover State --- function applyHoverState(card) { - if (!card || card === currentHoverState.card) return; // Already handling this card + if (!card || card === currentHoverState.card) return; // No change or already handling - clearHoverState(true); // Force clear previous state before applying new one + // Force clear previous state before applying new one to prevent overlaps/ghosts + clearHoverState(true); const schemaContainer = card.closest('.schema-container'); const sectionHeader = schemaContainer ? schemaContainer.querySelector('.section-title') : null; if (!sectionHeader || !card.id || !sectionHeader.id) { console.warn("Missing ID on card or section header, cannot draw line for:", card); - return; // Need IDs for LeaderLine + // Still apply visual highlight even if line can't be drawn + mainContainer.classList.add('is-dimmed'); + card.classList.add('is-highlighted'); + if (schemaContainer) { + schemaContainer.classList.add('is-highlighted-section'); + } + currentHoverState.card = card; // Track the card even without a line + return; } - // Set new state + // Set new state visually currentHoverState.card = card; mainContainer.classList.add('is-dimmed'); card.classList.add('is-highlighted'); if (schemaContainer) { schemaContainer.classList.add('is-highlighted-section'); - const title = schemaContainer.querySelector('.section-title'); - if (title) title.style.opacity = '1'; // Ensure title is fully visible } - // Draw Line + // Draw Line - Using element IDs is preferred for LeaderLine try { + const startElement = document.getElementById(sectionHeader.id); + const endElement = document.getElementById(card.id); + if (!startElement || !endElement) { + throw new Error(`Could not find elements for LeaderLine: ${sectionHeader.id} or ${card.id}`); + } + const cardColor = getElementColor(card); - // Using element IDs is generally reliable const line = new LeaderLine( - document.getElementById(sectionHeader.id), - document.getElementById(card.id), + startElement, + endElement, { color: cardColor, size: 2, - path: 'fluid', + path: 'fluid', // 'straight', 'arc', 'fluid', 'grid', 'magnet' startSocket: 'bottom', endSocket: 'top', - dash: { animation: true, len: 6, gap: 3 }, + startSocketGravity: [0, -30], // Nudge start point down slightly from header center + endSocketGravity: [0, 30], // Nudge end point up slightly from card center + dash: { animation: true, len: 8, gap: 4 }, + // endPlug: 'arrow1', // Optional: add an arrowhead + // middleLabel: LeaderLine.pathLabel({ text: 'Connects', color: cardColor }), // Optional label } ); - currentHoverState.line = line; + currentHoverState.line = line; // Store the line instance } catch (e) { - console.error("LeaderLine error:", e, "From:", sectionHeader.id, "To:", card.id); + console.error("LeaderLine error:", e); + // Ensure hover state is cleared if line fails, prevents broken state clearHoverState(true); } } - // --- Event Listeners using Event Delegation --- + // --- Event Listeners using Event Delegation on main container --- mainContainer.addEventListener('mouseover', (event) => { const targetCard = event.target.closest('.info-card'); + // Apply hover state only if we enter a *new* card if (targetCard && targetCard !== currentHoverState.card) { applyHoverState(targetCard); } }); mainContainer.addEventListener('mouseout', (event) => { - // Check if moving outside the main container entirely - if (!mainContainer.contains(event.relatedTarget)) { - clearHoverState(true); - } - // Check if moving from a card to a non-card area within the container - else if (event.target.classList.contains('info-card') && !event.relatedTarget?.closest('.info-card')) { - // Delay slightly to see if we immediately enter another card + const relatedTarget = event.relatedTarget; + const currentCard = currentHoverState.card; + + // Check if mouse left the currently hovered card *and* didn't enter another card immediately + if (currentCard && event.target === currentCard && !currentCard.contains(relatedTarget) && !relatedTarget?.closest('.info-card')) { + // Use a small delay to handle brief movements between elements / gaps setTimeout(() => { - const isOverAnotherCard = mainContainer.querySelector('.info-card:hover'); - if (!isOverAnotherCard) { // If not over another card after delay, clear - clearHoverState(true); + // Re-check if mouse is *still* not over any card after delay + if (!mainContainer.querySelector('.info-card:hover')) { + clearHoverState(false); // Use non-forced clear initially } - }, 50); // Increased delay slightly + }, 50); + } + // Handle leaving the main container entirely + else if (!mainContainer.contains(relatedTarget)) { + clearHoverState(true); // Force clear if leaving container } }); - // --- Update lines on resize --- + + // --- Update lines on resize and scroll --- const positionLines = debounce(() => { if (currentHoverState.line) { - try { currentHoverState.line.position(); } - catch (e) { console.warn("Reposition error", e); clearHoverState(true); } + try { + // Check if the line instance still exists and has a position method + if (currentHoverState.line && typeof currentHoverState.line.position === 'function') { + currentHoverState.line.position(); + } else { + // Line might have been removed unexpectedly, clear state + clearHoverState(true); + } + } + catch (e) { + console.warn("Reposition error:", e); + clearHoverState(true); // Clear state if reposition fails + } } - }, 150); // Restore debounce for potentially many elements + }, 100); // Adjust debounce timing as needed window.addEventListener('resize', positionLines); + window.addEventListener('scroll', positionLines, { passive: true }); // Also position on scroll // --- Toggle Chevron Icons on Collapse Buttons --- - const collapseToggles = document.querySelectorAll('.details-toggle'); - collapseToggles.forEach(button => { - const targetId = button.getAttribute('data-bs-target'); - const targetCollapse = document.querySelector(targetId); - const icon = button.querySelector('.bi'); - - if (targetCollapse && icon) { - // Initial state - if (targetCollapse.classList.contains('show')) { - icon.classList.remove('bi-chevron-down'); icon.classList.add('bi-chevron-up'); + const collapseElements = document.querySelectorAll('.collapse'); + collapseElements.forEach(collapseEl => { + const button = document.querySelector(`.details-toggle[data-bs-target="#${collapseEl.id}"]`); + const icon = button ? button.querySelector('.bi') : null; + + if (button && icon) { + // Initial state check (in case some are open by default) + if (collapseEl.classList.contains('show')) { + icon.classList.remove('bi-chevron-down'); + icon.classList.add('bi-chevron-up'); button.setAttribute('aria-expanded', 'true'); } else { - icon.classList.remove('bi-chevron-up'); icon.classList.add('bi-chevron-down'); + icon.classList.remove('bi-chevron-up'); + icon.classList.add('bi-chevron-down'); button.setAttribute('aria-expanded', 'false'); } - // Use Bootstrap events - targetCollapse.addEventListener('shown.bs.collapse', positionLines); - targetCollapse.addEventListener('hidden.bs.collapse', positionLines); - targetCollapse.addEventListener('show.bs.collapse', () => { - icon.classList.remove('bi-chevron-down'); icon.classList.add('bi-chevron-up'); - // Reposition slightly after animation starts helps leader-line cope + // Use Bootstrap events for reliability + collapseEl.addEventListener('show.bs.collapse', () => { + icon.classList.remove('bi-chevron-down'); + icon.classList.add('bi-chevron-up'); + button.setAttribute('aria-expanded', 'true'); + // Reposition lines slightly after animation starts setTimeout(positionLines, 50); }); - targetCollapse.addEventListener('hide.bs.collapse', () => { - icon.classList.remove('bi-chevron-up'); icon.classList.add('bi-chevron-down'); + collapseEl.addEventListener('shown.bs.collapse', positionLines); // Final position after animation + + collapseEl.addEventListener('hide.bs.collapse', () => { + icon.classList.remove('bi-chevron-up'); + icon.classList.add('bi-chevron-down'); + button.setAttribute('aria-expanded', 'false'); + // Reposition lines slightly after animation starts setTimeout(positionLines, 50); }); + collapseEl.addEventListener('hidden.bs.collapse', positionLines); // Final position after animation } });