diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a9bf9a2e..a5d23fa2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Releases prior to 7.0 has been removed from this file to declutter search result ### Fixed +- cli: Fixed crash when running `schema wipe` on empty or in-memory database. +- database: Fixed `dipdup_wipe` function not dropping all user-defined objects in some cases. - database: Fix exception when creating connections with aiosqlite==0.22.0. ## [8.5.1] - 2025-11-03 diff --git a/docs/12.faq.md b/docs/12.faq.md index 805c70607..0b1bc8bcd 100644 --- a/docs/12.faq.md +++ b/docs/12.faq.md @@ -28,44 +28,27 @@ DipDup is a Python framework for building indexing applications. It allows you t ### What hardware do I need to run DipDup? -DipDup can run on any amd64/arm64 machine that runs Python 3.12. Aim for good single-thread CPU performance and fast storage. +DipDup can run on any amd64/arm64 machine that runs Python. Aim for good single-threaded and disk I/O performance. You need at least 512M of RAM, but actual requirements can grow significantly depending on the number and complexity of indexes, the size of internal queues and caches, and `CachedModel` usage. ## Indexing -### What's the difference between "level" and "block number"? - -The term "level" is used in Tezos and some other blockchains as a synonym for "block height" or "block number". We use this term for historical reasons. - -### What does "head" mean? - -You may see the term "head" in logs, metrics and internal db tables. It refers to the latest known position in the blockchain, but with some nuances: - -For **datasources** it means the last block available from the datalake/node/API indexer knows about. It may differ from the actual chain head for various reasons: network latency, node sync status, rate limits, etc. - -For **indexes** it means the last block processed and stored in the database. 
It may lag behind the datasource head if indexing is still in progress or latest blocks have not triggered any handlers. - -What to use for monitoring depend on your use case. - ### How to index similar but not identical contracts as a single entity? Multiple contracts can provide the same interface but have different storage structures. Examples are ERC20/ERC721/ERC1155 standard tokens on Ethereum and FA1.2/FA2 ones on Tezos. If you try to use the same typename for them, indexing will fail because of the storage mismatch. However, you can modify typeclasses manually. Edit the `types//storage.py` file and comment out fields, leaving only the ones used in your index (common for all contracts with the same interface). ```python class ContractStorage(BaseModel): - model_config = ConfigDict( - extra='forbid', - ) + class Config: + extra = Extra.ignore common_ledger: dict[str, str] # unique_field_foo: str # unique_field_bar: str ``` -Don't forget the `model_config` field. - -To restore the original typeclass, remove the modified file and run `dipdup init` again. You can also add the `--force` flag to overwrite all ABIs and typeclasses. +Don't forget the `Extra.ignore` Pydantic hint; otherwise, storage deserialization will fail. To restore the original typeclass, remove the modified file and run `dipdup init` again. You can also add the `--force` flag to overwrite all ABIs and typeclasses. ### How to use off-chain datasources? @@ -194,11 +177,11 @@ It will update both the CLI tool and Python package. **tl;dr**: Just use `uv` for everything. -For historical reasons, Python package management is a mess. There are multiple tools and approaches to manage Python dependencies. **pip** is a general-purpose package manager. It's simple and robust, but only covers basic functionality. For a full-fledged project, you need a tool to handle virtual environments, lock files, dependency resolution, publishing, etc. 
Some of the most popular tools are uv, Poetry, PDM, Hatch, and others.
To perform a migration, run the following commands: ```shell [Terminal] rm *.lock diff --git a/src/dipdup/cli.py b/src/dipdup/cli.py index 62776966d..4f52bb5bc 100644 --- a/src/dipdup/cli.py +++ b/src/dipdup/cli.py @@ -827,6 +827,10 @@ async def schema_wipe(ctx: click.Context, immune: bool, force: bool) -> None: immune_tables = set() if immune else config.database.immune_tables if isinstance(config.database, SqliteDatabaseConfig): + if config.database.path == ':memory:': + _logger.warning('Attempted to wipe in-memory database; no action required') + return + message = 'Support for immune tables in SQLite is experimental and requires `advanced.unsafe_sqlite` flag set' if config.advanced.unsafe_sqlite: immune_tables = immune_tables | ALWAYS_IMMUNE_TABLES diff --git a/src/dipdup/database.py b/src/dipdup/database.py index 283300613..1f16cf36f 100644 --- a/src/dipdup/database.py +++ b/src/dipdup/database.py @@ -23,6 +23,7 @@ from tortoise.backends.base.executor import EXECUTOR_CACHE from tortoise.backends.sqlite.client import SqliteClient from tortoise.connection import connections +from tortoise.exceptions import OperationalError from tortoise.fields import DecimalField from tortoise.models import Model as TortoiseModel from tortoise.utils import get_schema_sql @@ -304,8 +305,16 @@ async def _pg_wipe_schema( for table in immune_tables: await _pg_move_table(conn, table, schema_name, immune_schema_name) - await conn.execute_script(f"SELECT dipdup_wipe('{schema_name}')") - + try: + await conn.execute_script(f"SELECT dipdup_wipe('{schema_name}')") + except OperationalError as e: + if 'function dipdup_wipe' not in str(e): + raise + _logger.warning('`dipdup_wipe` function is not defined in the database.') + _logger.info( + 'Either the schema is empty already or you are connecting to the wrong database. No actions were performed.' 
+ ) + return if immune_tables: for table in immune_tables: await _pg_move_table(conn, table, immune_schema_name, schema_name) diff --git a/src/dipdup/sql/dipdup_wipe.sql b/src/dipdup/sql/dipdup_wipe.sql index dcaa2cc83..f43bcbc2e 100644 --- a/src/dipdup/sql/dipdup_wipe.sql +++ b/src/dipdup/sql/dipdup_wipe.sql @@ -1,95 +1,79 @@ --- Drops all user-defined objects (views, materialized views, tables, sequences, types, functions, TimescaleDB hypertables/chunks) --- in the specified schema. A complete schema wipe without dropping the schema itself. Afair this was implemented for compatibility --- with some cloud providers. +-- Drops all user-defined objects in the specified schema. A complete schema wipe without dropping the schema itself. +-- Affects views, materialized views, tables (including hypertables), sequences, composite types, functions, and procedures. +-- This functionality was implemented for compatibility with some cloud provider I can't remember, who doesn't allow dropping schemas directly. +-- Should work with PostgreSQL 11+ and TimescaleDB 2.0+. CREATE OR REPLACE FUNCTION dipdup_wipe(schema_name VARCHAR) RETURNS void AS $$ DECLARE rec RECORD; BEGIN -- Drop views FOR rec IN - SELECT 'DROP VIEW IF EXISTS ' || quote_ident(schema_name) || '.' || quote_ident(viewname) || ' CASCADE;' - FROM pg_views - WHERE schemaname = schema_name + SELECT format('DROP VIEW IF EXISTS %I.%I CASCADE', schema_name, viewname) AS stmt + FROM pg_views WHERE schemaname = schema_name LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; + EXECUTE rec.stmt; END LOOP; -- Drop materialized views FOR rec IN - SELECT 'DROP MATERIALIZED VIEW IF EXISTS ' || quote_ident(schema_name) || '.' 
|| quote_ident(matviewname) || ' CASCADE;' - FROM pg_matviews - WHERE schemaname = schema_name + SELECT format('DROP MATERIALIZED VIEW IF EXISTS %I.%I CASCADE', schema_name, matviewname) AS stmt + FROM pg_matviews WHERE schemaname = schema_name LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; + EXECUTE rec.stmt; END LOOP; - -- Drop tables + -- Drop tables (includes hypertables; CASCADE handles chunks automatically) FOR rec IN - SELECT 'DROP TABLE IF EXISTS ' || quote_ident(schema_name) || '.' || quote_ident(tablename) || ' CASCADE;' - FROM pg_tables - WHERE schemaname = schema_name + SELECT format('DROP TABLE IF EXISTS %I.%I CASCADE', schema_name, tablename) AS stmt + FROM pg_tables WHERE schemaname = schema_name LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; + EXECUTE rec.stmt; END LOOP; -- Drop sequences FOR rec IN - SELECT 'DROP SEQUENCE IF EXISTS ' || quote_ident(schema_name) || '.' || quote_ident(sequencename) || ' CASCADE;' - FROM pg_sequences - WHERE schemaname = schema_name + SELECT format('DROP SEQUENCE IF EXISTS %I.%I CASCADE', schema_name, sequencename) AS stmt + FROM pg_sequences WHERE schemaname = schema_name LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; + EXECUTE rec.stmt; END LOOP; - -- Drop types + -- Drop composite types (excluding extension-owned) FOR rec IN - SELECT 'DROP TYPE IF EXISTS ' || quote_ident(schema_name) || '.' 
|| quote_ident(t.typname) || ' CASCADE;' + SELECT format('DROP TYPE IF EXISTS %I.%I CASCADE', schema_name, t.typname) AS stmt FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace - WHERE n.nspname = schema_name AND t.typtype = 'c' + WHERE n.nspname = schema_name + AND t.typtype IN ('c', 'e', 'd', 'r') + AND NOT EXISTS ( + SELECT 1 FROM pg_depend d + WHERE d.classid = 'pg_type'::regclass + AND d.objid = t.oid + AND d.deptype = 'e' + ) LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; + EXECUTE rec.stmt; END LOOP; - -- Drop functions + + -- Drop functions and procedures (excluding extension-owned) FOR rec IN - SELECT 'DROP FUNCTION IF EXISTS ' || quote_ident(schema_name) || '.' || quote_ident(p.proname) || '(' || oidvectortypes(p.proargtypes) || ') CASCADE;' + SELECT format('DROP ROUTINE IF EXISTS %I.%I(%s) CASCADE', + schema_name, p.proname, pg_get_function_identity_arguments(p.oid)) AS stmt FROM pg_proc p JOIN pg_namespace n ON n.oid = p.pronamespace WHERE n.nspname = schema_name + AND p.prokind IN ('f', 'p', 'a', 'w') + AND NOT EXISTS ( + SELECT 1 FROM pg_depend d + WHERE d.classid = 'pg_proc'::regclass + AND d.objid = p.oid + AND d.deptype = 'e' + ) LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; + EXECUTE rec.stmt; END LOOP; - -- Drop TimescaleDB hypertables and chunks (if any) - IF EXISTS (SELECT 1 FROM pg_class WHERE relname = 'hypertable' AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'timescaledb_information')) THEN - FOR rec IN - -- Use a very large interval ('10000 years') to ensure all TimescaleDB chunks are dropped, regardless of their age. - SELECT 'SELECT drop_chunks(interval ''10000 years'', ''' || quote_ident(schema_name) || '.' 
|| quote_ident(table_name) || ''');' - FROM timescaledb_information.hypertables - WHERE table_schema = schema_name - LOOP - BEGIN - EXECUTE rec."?column?"; - EXCEPTION WHEN others THEN END; - END LOOP; - END IF; - - -- Drop all remaining objects (extensions, etc.) if needed - -- (Extensions are usually global, not per-schema, so not dropped here) - RETURN; END; $$ LANGUAGE plpgsql;