Skip to content

Bug Report: Query planner breaks join with derived table containing a subquery with IN/NOT IN #17867

Closed
@arthurschreiber

Description

@arthurschreiber

Overview of the Issue

We have run into a query that gets incorrectly broken up by the Query Planner instead of fully being pushed down to MySQL.

Here's a simplified version of that query (using the vschema located at go/vt/vtgate/planbuilder/testdata/vschemas/schema.json):

SELECT m1.id
FROM music as m1
JOIN (
  SELECT max(id) as id
  FROM music
  WHERE music.user_id = 1234 AND music.foobar NOT IN (
    SELECT foobar FROM user_extra WHERE user_extra.user_id = 1234
  )
) as m2 ON m1.id = m2.id

The derived table m2 has enough information to be routed to a specific shard (user_extra and music have conditions on user_id, which is a hash vindex). The join between m1 and m2 happens on a shared lookup vindex.

The above mentioned conditions should be enough to have the query fully merged by the query planner and pushed down completely to MySQL. Instead, the query is broken up. Here's the query plan (taken on main):

{
  "QueryType": "SELECT",
  "Original": "SELECT m1.id FROM music as m1 JOIN (SELECT max(id) as id FROM music WHERE music.user_id = 1234 AND music.foobar NOT IN (SELECT foobar FROM user_extra WHERE user_extra.user_id = 1234)) as m2 ON m1.id = m2.id",
  "Instructions": {
    "OperatorType": "Join",
    "Variant": "Join",
    "JoinColumnIndexes": "R:0",
    "JoinVars": {
      "m2_id": 0
    },
    "TableName": "music_music",
    "Inputs": [
      {
        "OperatorType": "Route",
        "Variant": "EqualUnique",
        "Keyspace": {
          "Name": "user",
          "Sharded": true
        },
        "FieldQuery": "select m2.id from (select max(id) as id from music where 1 != 1) as m2 where 1 != 1",
        "Query": "select m2.id from (select max(id) as id from music where music.user_id = 1234 and music.foobar not in (select foobar from user_extra where user_extra.user_id = 1234)) as m2",
        "Table": "music",
        "Values": [
          "1234"
        ],
        "Vindex": "user_index"
      },
      {
        "OperatorType": "VindexLookup",
        "Variant": "EqualUnique",
        "Keyspace": {
          "Name": "user",
          "Sharded": true
        },
        "Values": [
          ":m2_id"
        ],
        "Vindex": "music_user_map",
        "Inputs": [
          {
            "OperatorType": "Route",
            "Variant": "IN",
            "Keyspace": {
              "Name": "user",
              "Sharded": true
            },
            "FieldQuery": "select `name`, keyspace_id from name_user_vdx where 1 != 1",
            "Query": "select `name`, keyspace_id from name_user_vdx where `name` in ::__vals",
            "Table": "name_user_vdx",
            "Values": [
              "::name"
            ],
            "Vindex": "user_index"
          },
          {
            "OperatorType": "Route",
            "Variant": "ByDestination",
            "Keyspace": {
              "Name": "user",
              "Sharded": true
            },
            "FieldQuery": "select m1.id from music as m1 where 1 != 1",
            "Query": "select m1.id from music as m1 where m1.id = :m2_id",
            "Table": "music"
          }
        ]
      }
    ]
  },
  "TablesUsed": [
    "user.music",
    "user.user_extra"
  ]
}

Variations of this query get merged correctly. For example, a derived table with a nested join:

SELECT m1.id
FROM music as m1
JOIN (
  SELECT max(id) as id
  FROM music
  INNER JOIN user_extra ue ON music.user_id = ue.user_id AND music.foobar = ue.foobar WHERE music.user_id = 1234
) as m2 ON m1.id = m2.id
{
  "QueryType": "SELECT",
  "Original": "SELECT m1.id FROM music as m1 JOIN (SELECT max(id) as id FROM music INNER JOIN user_extra ue ON music.user_id = ue.user_id AND music.foobar = ue.foobar WHERE music.user_id = 1234) as m2 ON m1.id = m2.id",
  "Instructions": {
    "OperatorType": "Route",
    "Variant": "EqualUnique",
    "Keyspace": {
      "Name": "user",
      "Sharded": true
    },
    "FieldQuery": "select m1.id from (select max(id) as id from music, user_extra as ue where 1 != 1) as m2, music as m1 where 1 != 1",
    "Query": "select m1.id from (select max(id) as id from music, user_extra as ue where music.user_id = 1234 and music.user_id = ue.user_id and music.foobar = ue.foobar) as m2, music as m1 where m1.id = m2.id",
    "Table": "music, user_extra",
    "Values": [
      "1234"
    ],
    "Vindex": "user_index"
  },
  "TablesUsed": [
    "user.music",
    "user.user_extra"
  ]
}

The query for the derived table gets merged correctly:

SELECT max(id) as id
FROM music
WHERE music.user_id = 1234 AND music.foobar NOT IN (
  SELECT foobar FROM user_extra WHERE user_extra.user_id = 1234
)
{
  "QueryType": "SELECT",
  "Original": "SELECT max(id) as id FROM music WHERE music.user_id = 1234 AND music.foobar NOT IN (SELECT foobar FROM user_extra WHERE user_extra.user_id = 1234)",
  "Instructions": {
    "OperatorType": "Route",
    "Variant": "EqualUnique",
    "Keyspace": {
      "Name": "user",
      "Sharded": true
    },
    "FieldQuery": "select max(id) as id from music where 1 != 1",
    "Query": "select max(id) as id from music where music.user_id = 1234 and music.foobar not in (select foobar from user_extra where user_extra.user_id = 1234)",
    "Table": "music",
    "Values": [
      "1234"
    ],
    "Vindex": "user_index"
  },
  "TablesUsed": [
    "user.music",
    "user.user_extra"
  ]
}

And a join with a derived table without a subquery works fine too:

SELECT m1.id
FROM music as m1
JOIN (
  SELECT max(id) as id FROM music WHERE music.user_id = 1234
) as m2 ON m1.id = m2.id
{
  "QueryType": "SELECT",
  "Original": "SELECT m1.id FROM music as m1 JOIN (SELECT max(id) as id FROM music WHERE music.user_id = 1234) as m2 ON m1.id = m2.id",
  "Instructions": {
    "OperatorType": "Route",
    "Variant": "EqualUnique",
    "Keyspace": {
      "Name": "user",
      "Sharded": true
    },
    "FieldQuery": "select m1.id from (select max(id) as id from music where 1 != 1) as m2, music as m1 where 1 != 1",
    "Query": "select m1.id from (select max(id) as id from music where music.user_id = 1234) as m2, music as m1 where m1.id = m2.id",
    "Table": "music",
    "Values": [
      "1234"
    ],
    "Vindex": "user_index"
  },
  "TablesUsed": [
    "user.music"
  ]
}

Reproduction Steps

N/A

Binary Version

v19 - main

Operating System and Environment details

N/A

Log Fragments

No response

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions