Uploaded image for project: 'Percona Operator for MongoDB'
  1. Percona Operator for MongoDB
  2. K8SPSMDB-638

backup can fail with "starting deadline exceeded" even if it finishes in PBM

Details

    • Bug
    • Status: Open
    • Medium
    • Resolution: Unresolved
    • None
    • 1.16.0
    • None
    • Yes

    Description

      When running the demand-backup-sharded test, a backup is sometimes marked with error status because of "starting deadline exceeded", even though PBM actually starts and finishes the backup.

      I was running this from the current main branch.

      Here's how it looks:

      NAME                CLUSTER     STORAGE      DESTINATION                  STATUS   COMPLETED   AGE
      backup-aws-s3       some-name   aws-s3       psmdb/2022-02-04T19:28:16Z   ready    16m         17m
      backup-azure-blob   some-name   azure-blob   psmdb/2022-02-04T19:29:30Z   error                17m
      backup-gcp-cs       some-name   gcp-cs       psmdb/2022-02-04T19:28:53Z   ready    15m         17m
      backup-minio        some-name   minio        2022-02-04T19:27:39Z         ready    16m         17m
      
      apiVersion: psmdb.percona.com/v1
      kind: PerconaServerMongoDBBackup
      metadata:
        annotations:
          kubectl.kubernetes.io/last-applied-configuration: |
            {"apiVersion":"psmdb.percona.com/v1","kind":"PerconaServerMongoDBBackup","metadata":{"annotations":{},"name":"backup-azure-blob","namespace":"demand-backup-sharded-20865"},"spec":{"clusterName":"some-name","storageName":"azure-blob"}}
        creationTimestamp: "2022-02-04T19:27:21Z"
        generation: 1
        name: backup-azure-blob
        namespace: demand-backup-sharded-20865
        resourceVersion: "5171863"
        uid: e366905d-52b7-4fdc-872e-7b6441637eae
      spec:
        clusterName: some-name
        storageName: azure-blob
      status:
        azure:
          container: operator-testing
          credentialsSecret: azure-secret
          prefix: psmdb
        destination: psmdb/2022-02-04T19:29:30Z
        error: starting deadline exceeded
        lastTransition: "2022-02-04T19:29:30Z"
        pbmName: "2022-02-04T19:29:30Z"
        s3:
          bucket: ""
          credentialsSecret: ""
          insecureSkipTLSVerify: false
        start: "2022-02-04T19:29:30Z"
        state: error
        storageName: azure-blob
      

      However, in the PBM backups collection that same backup is marked as "done":

      {
          "_id" : ObjectId("61fd7e9a2e7a2737fcd8b70c"),
          "opid" : "61fd7e9a632f11bfa446b91c",
          "name" : "2022-02-04T19:29:30Z",
          "replsets" : [
              {
                  "name" : "rs2",
                  "dump_name" : "2022-02-04T19:29:30Z_rs2.dump.gz",
                  "oplog_name" : "2022-02-04T19:29:30Z_rs2.oplog.gz",
                  "start_ts" : NumberLong(1644002970),
                  "status" : "done",
                  "last_transition_ts" : NumberLong(1644003007),
                  "first_write_ts" : Timestamp(1644002968, 40),
                  "last_write_ts" : Timestamp(1644003004, 41),
                  "conditions" : [
                      {
                          "timestamp" : NumberLong(1644002970),
                          "status" : "running"
                      },
                      {
                          "timestamp" : NumberLong(1644003005),
                          "status" : "dumpDone"
                      },
                      {
                          "timestamp" : NumberLong(1644003007),
                          "status" : "done"
                      }
                  ],
                  "error" : ""
              },
              {
                  "name" : "rs1",
                  "dump_name" : "2022-02-04T19:29:30Z_rs1.dump.gz",
                  "oplog_name" : "2022-02-04T19:29:30Z_rs1.oplog.gz",
                  "start_ts" : NumberLong(1644002970),
                  "status" : "done",
                  "last_transition_ts" : NumberLong(1644003008),
                  "first_write_ts" : Timestamp(1644002970, 44),
                  "last_write_ts" : Timestamp(1644003005, 40),
                  "conditions" : [
                      {
                          "timestamp" : NumberLong(1644002970),
                          "status" : "running"
                      },
                      {
                          "timestamp" : NumberLong(1644003005),
                          "status" : "dumpDone"
                      },
                      {
                          "timestamp" : NumberLong(1644003008),
                          "status" : "done"
                      }
                  ],
                  "error" : ""
              },
              {
                  "name" : "rs0",
                  "dump_name" : "2022-02-04T19:29:30Z_rs0.dump.gz",
                  "oplog_name" : "2022-02-04T19:29:30Z_rs0.oplog.gz",
                  "start_ts" : NumberLong(1644002971),
                  "status" : "done",
                  "last_transition_ts" : NumberLong(1644003008),
                  "first_write_ts" : Timestamp(1644002970, 36),
                  "last_write_ts" : Timestamp(1644003005, 32),
                  "conditions" : [
                      {
                          "timestamp" : NumberLong(1644002971),
                          "status" : "running"
                      },
                      {
                          "timestamp" : NumberLong(1644003006),
                          "status" : "dumpDone"
                      },
                      {
                          "timestamp" : NumberLong(1644003008),
                          "status" : "done"
                      }
                  ],
                  "error" : ""
              },
              {
                  "name" : "cfg",
                  "dump_name" : "2022-02-04T19:29:30Z_cfg.dump.gz",
                  "oplog_name" : "2022-02-04T19:29:30Z_cfg.oplog.gz",
                  "start_ts" : NumberLong(1644002971),
                  "status" : "done",
                  "last_transition_ts" : NumberLong(1644003008),
                  "first_write_ts" : Timestamp(1644003001, 49),
                  "last_write_ts" : Timestamp(1644003005, 20),
                  "conditions" : [
                      {
                          "timestamp" : NumberLong(1644002971),
                          "status" : "running"
                      },
                      {
                          "timestamp" : NumberLong(1644003005),
                          "status" : "dumpDone"
                      },
                      {
                          "timestamp" : NumberLong(1644003008),
                          "status" : "done"
                      }
                  ],
                  "error" : ""
              }
          ],
          "compression" : "gzip",
          "store" : {
              "type" : "azure",
              "s3" : {
                  "region" : "",
                  "bucket" : "",
                  "credentials" : {
                      "access-key-id" : "",
                      "secret-access-key" : "",
                      "vault" : {
                          "server" : "",
                          "secret" : "",
                          "token" : ""
                      }
                  },
                  "insecureSkipTLSVerify" : false
              },
              "azure" : {
                  "account" : "k8soperators",
                  "container" : "operator-testing",
                  "prefix" : "psmdb",
                  "credentials" : {
                      "key" : "QahM09Py+TUL6rDiR8XGGywolXReKioEpjeD8w5zdAYNiVwK3cBha/LphyXSvviAUVSWvlxAXwV9w+I2ztcqTw=="
                  }
              },
              "filesystem" : {
                  "path" : ""
              }
          },
          "mongodb_version" : "4.4.10-11",
          "start_ts" : NumberLong(1644002970),
          "last_transition_ts" : NumberLong(1644003009),
          "first_write_ts" : Timestamp(1644002968, 40),
          "last_write_ts" : Timestamp(1644003005, 40),
          "hb" : Timestamp(1644003006, 41),
          "status" : "done",
          "conditions" : [
              {
                  "timestamp" : NumberLong(1644002970),
                  "status" : "starting"
              },
              {
                  "timestamp" : NumberLong(1644003003),
                  "status" : "running"
              },
              {
                  "timestamp" : NumberLong(1644003006),
                  "status" : "dumpDone"
              },
              {
                  "timestamp" : NumberLong(1644003009),
                  "status" : "done"
              }
          ],
          "n" : [
              {
                  "rs" : "rs0",
                  "n" : [
                      "some-name-rs0-2.some-name-rs0.demand-backup-sharded-20865.svc.cluster.local:27017",
                      "some-name-rs0-1.some-name-rs0.demand-backup-sharded-20865.svc.cluster.local:27017"
                  ],
                  "ack" : "some-name-rs0-1.some-name-rs0.demand-backup-sharded-20865.svc.cluster.local:27017"
              },
              {
                  "rs" : "rs2",
                  "n" : [
                      "some-name-rs2-2.some-name-rs2.demand-backup-sharded-20865.svc.cluster.local:27017",
                      "some-name-rs2-0.some-name-rs2.demand-backup-sharded-20865.svc.cluster.local:27017"
                  ],
                  "ack" : "some-name-rs2-0.some-name-rs2.demand-backup-sharded-20865.svc.cluster.local:27017"
              },
              {
                  "rs" : "rs1",
                  "n" : [
                      "some-name-rs1-2.some-name-rs1.demand-backup-sharded-20865.svc.cluster.local:27017",
                      "some-name-rs1-1.some-name-rs1.demand-backup-sharded-20865.svc.cluster.local:27017"
                  ],
                  "ack" : "some-name-rs1-2.some-name-rs1.demand-backup-sharded-20865.svc.cluster.local:27017"
              },
              {
                  "rs" : "cfg",
                  "n" : [
                      "some-name-cfg-1.some-name-cfg.demand-backup-sharded-20865.svc.cluster.local:27017",
                      "some-name-cfg-2.some-name-cfg.demand-backup-sharded-20865.svc.cluster.local:27017",
                      "some-name-cfg-0.some-name-cfg.demand-backup-sharded-20865.svc.cluster.local:27017"
                  ],
                  "ack" : "some-name-cfg-1.some-name-cfg.demand-backup-sharded-20865.svc.cluster.local:27017"
              }
          ],
          "pbm_version" : "1.6.1",
          "balancer" : "full",
          "error" : ""
      }
      

      full logs: K8SPSMDB-638.tar.gz

      Attachments

        Issue Links

          Activity

            People

              andrii.dema Andrii Dema
              tomislav.plavcic@percona.com Tomislav Plavcic
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

              Dates

                Created:
                Updated:

                Smart Checklist