job document count sometimes stays at 0 #3736

@seanstory

Description

Bug Description

Some sync jobs do not seem to count indexed documents or their volume, even though documents are being ingested.

To Reproduce

  1. Create a GitHub connector. Use a valid PAT, and configure it for elastic/connectors. (An API sketch follows this list.)
  2. Use the mappings below.
  3. Create the pipeline below.
  4. Configure the connector's pipeline with the request below.
  5. Run a sync, and see 0 indexed documents in the sync job record, even though the attached index does have documents.
  6. Be confused.
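
For step 1, the connector can also be created through the connector API instead of the Kibana UI. A minimal sketch, assuming API creation; my-github-connector and search-github are placeholder names, and the PAT and repository configuration still need to be set afterwards (in Kibana or via the connector configuration API):

PUT _connector/my-github-connector
{
  "index_name": "search-github",
  "name": "GitHub connector",
  "service_type": "github"
}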
Mapping
{
  "mappings": {
    "properties": {
      "title": { "type": "semantic_text" },
      "body": { "type": "semantic_text" },
      "description": { "type": "semantic_text" },
      "semantic_comments": { "type": "semantic_text" },
      "issue_comments": {
        "properties": {
          "body": { 
            "type": "text",
            "copy_to": "semantic_comments"
           },
          "author": {
            "properties": {
              "login": { "type": "keyword" }
            }
          }
        }
      },
      "reviews_comments": {
        "properties": {
          "body": { 
            "type": "text",
            "copy_to": "semantic_comments"
           },
          "comments": {
            "properties": {
              "body": { 
                "type": "text",
                "copy_to": "semantic_comments"
              }
            }
          },
          "author": { "type": "keyword" },
          "state": { "type": "keyword" }
        }
      },
      "labels_field": {
        "properties": {
          "name": { "type": "keyword" },
          "description": { "type": "text" }
        }
      },
      "author": {
        "properties": {
          "login": { "type": "keyword" }
        }
      },
      "assignees_list": {
        "properties": {
          "login": { "type": "keyword" }
        }
      },
      "requested_reviewers": {
        "properties": {
          "requestedReviewer": {
            "properties": {
              "login": { "type": "keyword" }
            }
          }
        }
      },
      "defaultBranchRef": {
        "properties": {
          "name": { "type": "keyword" }
        }
      },
      "primaryLanguage": {
        "properties": {
          "name": { "type": "keyword" }
        }
      },
      "state": { "type": "keyword" },
      "type": { "type": "keyword" },
      "url": { "type": "keyword" },
      "nameWithOwner": { "type": "keyword" }
    }
  }
}
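
Assuming the attached index is created before the first sync, the mapping above can be applied with a create-index request; search-github is a placeholder index name, and the body is abbreviated here (the Kibana console accepts // comments):

PUT search-github
{
  "mappings": {
    "properties": {
      // paste the full "properties" object from the mapping above
      "title": { "type": "semantic_text" }
    }
  }
}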
Pipeline
{
  "processors": [
      {
        "attachment": {
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor 'attachment' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ],
          "remove_binary": false,
          "field": "_attachment",
          "target_field": "_extracted_attachment",
          "description": "Extract text from binary attachments",
          "ignore_missing": true,
          "indexed_chars_field": "_attachment_indexed_chars",
          "if": "ctx?._extract_binary_content == true"
        }
      },
      {
        "set": {
          "ignore_empty_value": true,
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor 'set' with tag 'set_body' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ],
          "field": "body",
          "description": "Set any extracted text on the 'body' field",
          "tag": "set_body",
          "copy_from": "_extracted_attachment.content",
          "if": "ctx?._extract_binary_content == true"
        }
      },
      {
        "gsub": {
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor 'gsub' with tag 'remove_replacement_chars' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ],
          "field": "body",
          "pattern": "�",
          "description": "Remove unicode 'replacement' characters",
          "ignore_missing": true,
          "tag": "remove_replacement_chars",
          "replacement": "",
          "if": "ctx?._extract_binary_content == true"
        }
      },
      {
        "gsub": {
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor 'gsub' with tag 'remove_extra_whitespace' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ],
          "field": "body",
          "pattern": """\s+""",
          "description": "Squish whitespace",
          "ignore_missing": true,
          "tag": "remove_extra_whitespace",
          "replacement": " ",
          "if": "ctx?._reduce_whitespace == true"
        }
      },
      {
        "trim": {
          "description": "Trim leading and trailing whitespace",
          "ignore_missing": true,
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor 'trim' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ],
          "field": "body",
          "if": "ctx?._reduce_whitespace == true"
        }
      },
      {
        "remove": {
          "description": "Remove meta fields",
          "ignore_missing": true,
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor 'remove' with tag 'remove_meta_fields' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ],
          "tag": "remove_meta_fields",
          "field": [
            "_attachment",
            "_attachment_indexed_chars",
            "_extracted_attachment",
            "_extract_binary_content",
            "_reduce_whitespace",
            "_run_ml_inference"
          ]
        }
      },
      {
        "remove": {
          "field": [
            "_timestamp",
            "id",
            "number",
            "forkCount",
            "stargazerCount",
            "watchers",
            "isArchived",
            "isFork",
            "visibility",
            "mergedAt",
            "closedAt",
            "createdAt",
            "name"
          ],
          "ignore_missing": true
        }
      }
  ]
}
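
Step 3 registers this definition as an ingest pipeline. The pipeline name must match the "name" used in the connector pipeline settings below (github_pipeline here); the body is abbreviated (the Kibana console accepts // comments):

PUT _ingest/pipeline/github_pipeline
{
  "description": "Custom pipeline for the GitHub connector",
  "processors": [
    // paste the full processor list from the JSON above
  ]
}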
Set the pipeline
PUT _connector/<connector_id>/_pipeline
{
  "pipeline": {
    "extract_binary_content": true,
    "name": "github_pipeline",
    "reduce_whitespace": true,
    "run_ml_inference": false
  }
}
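
After running a sync (step 5), the mismatch can be confirmed by comparing the sync job record against an index count; <connector_id> and search-github are placeholders:

GET _connector/_sync_job?connector_id=<connector_id>&size=1
// indexed_document_count and indexed_document_volume stay at 0 for affected jobs

GET search-github/_count
// count reflects the documents that were actually ingested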

Expected behavior

The sync job record's indexed document count should correspond to the number of documents in the attached index.

Environment

9.2.0-SNAPSHOT (main, at time of writing)

Additional context

Slack thread: https://elastic.slack.com/archives/C01795T48LQ/p1759152996129649
