Skip to content

batches: retry changeset spec upload by default #705

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ All notable changes to `src-cli` are documented in this file.

### Changed

- Uploading changeset specs from `src batch apply` and `src batch preview` will now retry up to 5 times by default before failing. This can be controlled through the `-retries` flag. [#30431](https://github.com/sourcegraph/sourcegraph/issues/30431)

### Fixed

### Removed
Expand Down
7 changes: 6 additions & 1 deletion cmd/src/batch_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ type batchExecuteFlags struct {
file string
keepLogs bool
parallelism int
retries int
timeout time.Duration
workspace string
cleanArchives bool
Expand Down Expand Up @@ -122,6 +123,10 @@ func newBatchExecuteFlags(flagSet *flag.FlagSet, workspaceExecution bool, cacheD
&caf.parallelism, "j", runtime.GOMAXPROCS(0),
"The maximum number of parallel jobs. Default is GOMAXPROCS.",
)
flagSet.IntVar(
&caf.retries, "retries", 5,
"The maximum number of retries that will be made when sending a changeset spec.",
)
Comment on lines +126 to +129
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think 99% of users would assume this controls retries for step command executions, not a small implementation detail in one of the many subsets of a batch spec execution.
I would probably prefer to be very explicit like -changeset-spec-upload-retries.

flagSet.DurationVar(
&caf.timeout, "timeout", 60*time.Minute,
"The maximum duration a single batch spec step can take.",
Expand Down Expand Up @@ -427,7 +432,7 @@ func executeBatchSpec(ctx context.Context, ui ui.ExecUI, opts executeBatchSpecOp
ui.UploadingChangesetSpecs(len(specs))

for i, spec := range specs {
id, err := svc.CreateChangesetSpec(ctx, spec)
id, err := svc.CreateChangesetSpec(ctx, spec, opts.flags.retries)
if err != nil {
return err
}
Expand Down
21 changes: 17 additions & 4 deletions internal/batches/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"path"
"strings"
"sync"
"time"

"github.com/grafana/regexp"

Expand All @@ -24,6 +25,7 @@ import (
"github.com/sourcegraph/src-cli/internal/batches/executor"
"github.com/sourcegraph/src-cli/internal/batches/graphql"
"github.com/sourcegraph/src-cli/internal/batches/repozip"
"github.com/sourcegraph/src-cli/internal/retrier"
)

type Service struct {
Expand Down Expand Up @@ -139,20 +141,31 @@ mutation CreateChangesetSpec($spec: String!) {
}
`

func (svc *Service) CreateChangesetSpec(ctx context.Context, spec *batcheslib.ChangesetSpec) (graphql.ChangesetSpecID, error) {
func (svc *Service) CreateChangesetSpec(ctx context.Context, spec *batcheslib.ChangesetSpec, maxRetries int) (graphql.ChangesetSpecID, error) {
return svc.createChangesetSpec(ctx, spec, maxRetries, 5*time.Second, 1.6)
}

func (svc *Service) createChangesetSpec(ctx context.Context, spec *batcheslib.ChangesetSpec, maxRetries int, retryWait time.Duration, multiplier float64) (graphql.ChangesetSpecID, error) {
raw, err := json.Marshal(spec)
if err != nil {
return "", errors.Wrap(err, "marshalling changeset spec JSON")
}

req := svc.newRequest(createChangesetSpecMutation, map[string]interface{}{
"spec": string(raw),
})
var result struct {
CreateChangesetSpec struct {
ID string
}
}
if ok, err := svc.newRequest(createChangesetSpecMutation, map[string]interface{}{
"spec": string(raw),
}).Do(ctx, &result); err != nil || !ok {
Comment on lines -153 to -155
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that was a bug before, right? When ok is true and err is nil?

if err := retrier.Retry(func() error {
ok, err := req.Do(ctx, &result)
if !ok {
return errors.New("no data is available")
}
return err
}, maxRetries, retryWait, multiplier); err != nil {
return "", err
}

Expand Down
37 changes: 37 additions & 0 deletions internal/batches/service/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -698,3 +698,40 @@ func TestEnsureDockerImages(t *testing.T) {

})
}

const testCreateChangesetSpecFailureResult = `
{
"data": {
"createChangesetSpec": {
"id": []
}
}
}
`

const testCreateChangesetSpecSuccessResult = `
{
"data": {
"createChangesetSpec": {
"id": "foo"
}
}
}
`

func TestCreateChangesetSpec(t *testing.T) {
ctx := context.Background()
spec := &batcheslib.ChangesetSpec{}

// Force a retry by sending a failure back first.
client, done := mockGraphQLClient(
testCreateChangesetSpecFailureResult,
testCreateChangesetSpecSuccessResult,
)
t.Cleanup(done)

svc := Service{client: client}
id, err := svc.createChangesetSpec(ctx, spec, 5, 0, 0)
assert.Nil(t, err)
assert.EqualValues(t, "foo", id)
}
32 changes: 32 additions & 0 deletions internal/retrier/retrier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package retrier

import (
"time"

"github.com/sourcegraph/sourcegraph/lib/errors"
)

// Retry retries the given function up to max times if it returns an error.
// base and multiplier may be specified to use an exponential backoff between
// retries; if no backoff is required, set both to 0.
//
// The first time the function returns nil, nil will immediately be returned
// from Retry.
func Retry(f func() error, max int, base time.Duration, multiplier float64) error {
var errs errors.MultiError
wait := base

for {
err := f()
if err == nil {
return nil
}

errs = errors.Append(errs, err)
if len(errs.Errors()) > max {
return errs
}
time.Sleep(wait)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this listen on context cancellations? Could (not tested) prevent immediate quits using SIGINTs during the upload stage

wait = time.Duration(multiplier * float64(wait))
}
}
89 changes: 89 additions & 0 deletions internal/retrier/retrier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package retrier

import (
"strconv"
"testing"
"time"

"github.com/sourcegraph/sourcegraph/lib/errors"
"github.com/stretchr/testify/assert"
)

func TestRetry(t *testing.T) {
knownErr := errors.New("error!")

t.Run("no backoff", func(t *testing.T) {
for i := 0; i < 100; i += 10 {
t.Run(strconv.Itoa(i), func(t *testing.T) {
t.Run("immediate success", func(t *testing.T) {
retries := -1
err := Retry(func() error {
retries += 1
return nil
}, i, 0, 0)

assert.Nil(t, err)
assert.Equal(t, 0, retries)
})

t.Run("delayed success", func(t *testing.T) {
retries := -1
want := i / 2
err := Retry(func() error {
retries += 1
if retries >= want {
return nil
}
return knownErr
}, i, 0, 0)

assert.Nil(t, err)
assert.Equal(t, want, retries)
})

t.Run("no success", func(t *testing.T) {
retries := -1
err := Retry(func() error {
retries += 1
return knownErr
}, i, 0, 0)

assert.NotNil(t, err)
assert.ErrorIs(t, err, knownErr)
assert.Equal(t, i, retries)
})
})
}
})

t.Run("backoff", func(t *testing.T) {
base := 1 * time.Millisecond
want := []time.Duration{
0, // We don't test anything on the first try.
base,
2 * base,
4 * base,
8 * base,
16 * base,
}

retries := -1
var last time.Time
err := Retry(func() error {
retries += 1
if retries > 0 {
now := time.Now()
assert.GreaterOrEqual(t, now.Sub(last), want[retries])
last = now
} else {
last = time.Now()
}

return knownErr
}, 5, base, 2)

assert.NotNil(t, err)
assert.ErrorIs(t, err, knownErr)
assert.Equal(t, 5, retries)
})
}