Skip to content

[x86_64, debug] topology_experimental_raft/test_tablets_removenode failed with cassandra.ReadTimeout in test_replace #20926

Open
@scylladb-promoter

Description

@scylladb-promoter

https://jenkins.scylladb.com/job/scylla-6.2/job/next/21/ failed with the following error:


=================================== FAILURES ===================================
________________________________ test_replace.2 ________________________________

manager = <test.pylib.manager_client.ManagerClient object at 0x7ff870c88080>

    @pytest.mark.asyncio
    async def test_replace(manager: ManagerClient):
        logger.info("Bootstrapping cluster")
        cmdline = [
            '--logger-log-level', 'storage_service=trace',
            '--logger-log-level', 'raft_topology=trace',
        ]
    
        servers = await manager.servers_add(3, cmdline=cmdline)
    
        cql = manager.get_cql()
    
        await create_keyspace(cql, "test", 32, rf=1)
        await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")
    
        # We want RF=2 table to validate that quorum reads work after replacing node finishes
        # bootstrap which indicates that bootstrap waits for rebuilt.
        # Otherwise, some reads would fail to find a quorum.
        await create_keyspace(cql, "test2", 32, rf=2)
        await cql.run_async("CREATE TABLE test2.test (pk int PRIMARY KEY, c int);")
    
        await create_keyspace(cql, "test3", 32, rf=3)
        await cql.run_async("CREATE TABLE test3.test (pk int PRIMARY KEY, c int);")
        await cql.run_async("CREATE TABLE test3.test2 (pk int PRIMARY KEY, c int);")
    
        logger.info("Populating table")
    
        keys = range(256)
        await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])
        await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test2.test (pk, c) VALUES ({k}, {k});") for k in keys])
        await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test3.test (pk, c) VALUES ({k}, {k});") for k in keys])
    
        async def check_ks(ks):
            logger.info(f"Checking {ks}")
            query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.QUORUM)
            rows = await cql.run_async(query)
            assert len(rows) == len(keys)
            for r in rows:
                assert r.c == r.pk
    
        async def check():
            # RF=1 keyspace will experience data loss so don't check it.
            # We include it in the test only to check that the system doesn't crash.
            await check_ks("test2")
            await check_ks("test3")
    
        await check()
    
        # Disable migrations concurrent with replace since we don't handle nodes going down during migration yet.
        # See https://github.com/scylladb/scylladb/issues/16527
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)
    
        finish_writes = await start_writes(cql, "test3", "test2")
    
        logger.info('Replacing a node')
        await manager.server_stop(servers[0].server_id)
        replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = True)
        servers.append(await manager.server_add(replace_cfg))
        servers = servers[1:]
    
>       key_count = await finish_writes()

test/topology_experimental_raft/test_tablets_removenode.py:91: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
test/pylib/util.py:227: in finish
    await asyncio.gather(*tasks)
test/pylib/util.py:213: in do_writes
    raise e
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

worker_id = 1

    async def do_writes(worker_id: int):
        write_count = 0
        while not stop_event.is_set():
            pk = key_gen.next_pk()
    
            # Once next_pk() is produced, key_gen.last_key() is assumed to be in the database
            # hence we can't give up on it.
            while True:
                try:
                    await cql.run_async(stmt, [pk, pk])
                    # Check read-your-writes
>                   rows = await cql.run_async(rd_stmt, [pk])
E                   cassandra.ReadTimeout: Error from server: code=1200 [Coordinator node timed out waiting for replica nodes' responses] message="Operation timed out for test3.test2 - received only 1 responses from 2 CL=QUORUM." info={'consistency': 'QUORUM', 'required_responses': 2, 'received_responses': 1}

test/pylib/util.py:204: ReadTimeout
------------------------------ Captured log setup ------------------------------

Metadata

Metadata

Assignees

Labels

P2 (High Priority) · tests/flaky (A problem with a test, having flaky behavior) · triage/sa

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions