Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[x86_64, debug] topology_experimental_raft/test_tablets_removenode failed with #20926

Open
scylladb-promoter opened this issue Oct 1, 2024 · 0 comments
Labels
symptom/ci stability (Issues that failed in ScyllaDB CI - tests and framework) · tests/flaky (A problem with a test, having flaky behavior) · triage/oss

Comments

@scylladb-promoter
Copy link
Contributor

https://jenkins.scylladb.com/job/scylla-6.2/job/next/21/ failed with the following error:


=================================== FAILURES ===================================
________________________________ test_replace.2 ________________________________

manager = <test.pylib.manager_client.ManagerClient object at 0x7ff870c88080>

    @pytest.mark.asyncio
    async def test_replace(manager: ManagerClient):
        logger.info("Bootstrapping cluster")
        cmdline = [
            '--logger-log-level', 'storage_service=trace',
            '--logger-log-level', 'raft_topology=trace',
        ]
    
        servers = await manager.servers_add(3, cmdline=cmdline)
    
        cql = manager.get_cql()
    
        await create_keyspace(cql, "test", 32, rf=1)
        await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")
    
        # We want RF=2 table to validate that quorum reads work after replacing node finishes
        # bootstrap which indicates that bootstrap waits for rebuilt.
        # Otherwise, some reads would fail to find a quorum.
        await create_keyspace(cql, "test2", 32, rf=2)
        await cql.run_async("CREATE TABLE test2.test (pk int PRIMARY KEY, c int);")
    
        await create_keyspace(cql, "test3", 32, rf=3)
        await cql.run_async("CREATE TABLE test3.test (pk int PRIMARY KEY, c int);")
        await cql.run_async("CREATE TABLE test3.test2 (pk int PRIMARY KEY, c int);")
    
        logger.info("Populating table")
    
        keys = range(256)
        await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])
        await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test2.test (pk, c) VALUES ({k}, {k});") for k in keys])
        await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test3.test (pk, c) VALUES ({k}, {k});") for k in keys])
    
        async def check_ks(ks):
            logger.info(f"Checking {ks}")
            query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.QUORUM)
            rows = await cql.run_async(query)
            assert len(rows) == len(keys)
            for r in rows:
                assert r.c == r.pk
    
        async def check():
            # RF=1 keyspace will experience data loss so don't check it.
            # We include it in the test only to check that the system doesn't crash.
            await check_ks("test2")
            await check_ks("test3")
    
        await check()
    
        # Disable migrations concurrent with replace since we don't handle nodes going down during migration yet.
        # See https://github.com/scylladb/scylladb/issues/16527
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)
    
        finish_writes = await start_writes(cql, "test3", "test2")
    
        logger.info('Replacing a node')
        await manager.server_stop(servers[0].server_id)
        replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = True)
        servers.append(await manager.server_add(replace_cfg))
        servers = servers[1:]
    
>       key_count = await finish_writes()

test/topology_experimental_raft/test_tablets_removenode.py:91: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
test/pylib/util.py:227: in finish
    await asyncio.gather(*tasks)
test/pylib/util.py:213: in do_writes
    raise e
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

worker_id = 1

    async def do_writes(worker_id: int):
        write_count = 0
        while not stop_event.is_set():
            pk = key_gen.next_pk()
    
            # Once next_pk() is produced, key_gen.last_key() is assumed to be in the database
            # hence we can't give up on it.
            while True:
                try:
                    await cql.run_async(stmt, [pk, pk])
                    # Check read-your-writes
>                   rows = await cql.run_async(rd_stmt, [pk])
E                   cassandra.ReadTimeout: Error from server: code=1200 [Coordinator node timed out waiting for replica nodes' responses] message="Operation timed out for test3.test2 - received only 1 responses from 2 CL=QUORUM." info={'consistency': 'QUORUM', 'required_responses': 2, 'received_responses': 1}

test/pylib/util.py:204: ReadTimeout
------------------------------ Captured log setup ------------------------------
@scylladb-promoter added the labels symptom/ci stability (Issues that failed in ScyllaDB CI - tests and framework), tests/flaky (A problem with a test, having flaky behavior), and triage/oss on Oct 1, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
symptom/ci stability (Issues that failed in ScyllaDB CI - tests and framework) · tests/flaky (A problem with a test, having flaky behavior) · triage/oss
Projects
None yet
Development

No branches or pull requests

1 participant