scheduler: skip evict-leader-scheduler when setting schedule deny label #8303

Merged Jun 24, 2024 (31 commits)
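This PR exempts the evict-leader-scheduler from the schedule deny region label, so that setting the label no longer blocks the leader evictions that TiDB Operator relies on for a graceful TiKV restart. Along the way it adds DeleteScheduler and DeleteStoreLabel to the HTTP client, makes the server reject deleting a label from a store that has none, and reworks the real-cluster test harness.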
1 change: 1 addition & 0 deletions .gitignore
@@ -26,3 +26,4 @@ coverage
*.txt
go.work*
embedded_assets_handler.go
*.log
27 changes: 27 additions & 0 deletions client/http/interface.go
@@ -51,6 +51,7 @@
GetStore(context.Context, uint64) (*StoreInfo, error)
DeleteStore(context.Context, uint64) error
SetStoreLabels(context.Context, int64, map[string]string) error
DeleteStoreLabel(ctx context.Context, storeID int64, labelKey string) error
GetHealthStatus(context.Context) ([]Health, error)
/* Config-related interfaces */
GetConfig(context.Context) (map[string]any, error)
@@ -65,6 +66,7 @@
/* Scheduler-related interfaces */
GetSchedulers(context.Context) ([]string, error)
CreateScheduler(ctx context.Context, name string, storeID uint64) error
DeleteScheduler(ctx context.Context, name string) error
SetSchedulerDelay(context.Context, string, int64) error
/* Rule-related interfaces */
GetAllPlacementRuleBundles(context.Context) ([]*GroupBundle, error)
@@ -81,6 +83,10 @@
DeletePlacementRuleGroupByID(context.Context, string) error
GetAllRegionLabelRules(context.Context) ([]*LabelRule, error)
GetRegionLabelRulesByIDs(context.Context, []string) ([]*LabelRule, error)
// `SetRegionLabelRule` sets the label rule for a region.
// When a label rule (deny scheduler) is set,
// 1. All schedulers will be disabled except for the evict-leader-scheduler.
// 2. The merge-checker will be disabled, preventing these regions from being merged.
SetRegionLabelRule(context.Context, *LabelRule) error
PatchRegionLabelRules(context.Context, *LabelRulePatch) error
/* Scheduling-related interfaces */
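For context, a minimal sketch of setting a deny rule through this client. `LabelRule` and `RegionLabel` are this package's types, but the rule ID, the key range, and the shape of `Data` below are assumptions for illustration, not taken from this diff:

	// A sketch: label an (assumed) whole-keyspace range with schedule=deny.
	// cli is an already-constructed Client from this package.
	rule := &LabelRule{
		ID:       "schedule-deny-example", // illustrative ID
		Labels:   []RegionLabel{{Key: "schedule", Value: "deny"}},
		RuleType: "key-range",
		// Data is assumed to carry hex-encoded key bounds for the
		// "key-range" rule type; empty strings stand for the full range.
		Data: []any{map[string]any{"start_key": "", "end_key": ""}},
	}
	if err := cli.SetRegionLabelRule(ctx, rule); err != nil {
		// handle the error
	}

Once such a rule is in place, the two effects listed in the comment above apply to the covered regions; per this PR, the evict-leader-scheduler is the one scheduler that keeps running.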
@@ -339,6 +345,19 @@
WithBody(jsonInput))
}

// DeleteStoreLabel deletes the label with the given key from the specified store.
func (c *client) DeleteStoreLabel(ctx context.Context, storeID int64, labelKey string) error {
jsonInput, err := json.Marshal(labelKey)
if err != nil {
return errors.Trace(err)
}
return c.request(ctx, newRequestInfo().
WithName(deleteStoreLableName).
WithURI(LabelByStoreID(storeID)).
WithMethod(http.MethodDelete).
WithBody(jsonInput))
}

// GetHealthStatus gets the health status of the cluster.
func (c *client) GetHealthStatus(ctx context.Context) ([]Health, error) {
var healths []Health
@@ -762,6 +781,14 @@
WithBody(inputJSON))
}

// DeleteScheduler deletes a scheduler from the PD cluster.
func (c *client) DeleteScheduler(ctx context.Context, name string) error {
return c.request(ctx, newRequestInfo().
WithName(deleteSchedulerName).
WithURI(SchedulerByName(name)).
WithMethod(http.MethodDelete))
}

// AccelerateSchedule accelerates the scheduling of the regions within the given key range.
// The keys in the key range should be encoded in the hex bytes format (without encoding to the UTF-8 bytes).
func (c *client) AccelerateSchedule(ctx context.Context, keyRange *KeyRange) error {
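A minimal usage sketch of the two client methods this PR adds. The endpoint, caller name, scheduler name, and store ID are assumptions for illustration:

	package main

	import (
		"context"
		"log"

		pd "github.com/tikv/pd/client/http"
	)

	func main() {
		// Assumed PD endpoint; the first argument names the caller for metrics.
		cli := pd.NewClient("usage-example", []string{"http://127.0.0.1:2379"})
		defer cli.Close()

		ctx := context.Background()
		// Delete a scheduler by name (assumed to have been created earlier).
		if err := cli.DeleteScheduler(ctx, "evict-leader-scheduler"); err != nil {
			log.Fatal(err)
		}
		// Delete the "zone" label from store 1. With this PR, a store that
		// has no labels at all returns "the label key ... does not exist".
		if err := cli.DeleteStoreLabel(ctx, 1, "zone"); err != nil {
			log.Fatal(err)
		}
	}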
2 changes: 2 additions & 0 deletions client/http/request_info.go
@@ -41,6 +41,7 @@ const (
getStoreName = "GetStore"
deleteStoreName = "DeleteStore"
setStoreLabelsName = "SetStoreLabels"
deleteStoreLableName = "DeleteStoreLabel"
getHealthStatusName = "GetHealthStatus"
getConfigName = "GetConfig"
setConfigName = "SetConfig"
@@ -53,6 +54,7 @@
getReplicateConfigName = "GetReplicateConfig"
getSchedulersName = "GetSchedulers"
createSchedulerName = "CreateScheduler"
deleteSchedulerName = "DeleteScheduler"
setSchedulerDelayName = "SetSchedulerDelay"
getAllPlacementRuleBundlesName = "GetAllPlacementRuleBundles"
getPlacementRuleBundleByGroupName = "GetPlacementRuleBundleByGroup"
6 changes: 5 additions & 1 deletion pkg/schedule/schedulers/scheduler_controller.go
@@ -456,6 +456,7 @@ func (s *ScheduleController) Stop() {

// Schedule tries to create some operators.
func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator {
_, isEvictLeaderScheduler := s.Scheduler.(*evictLeaderScheduler)
retry:
for i := 0; i < maxScheduleRetries; i++ {
// no need to retry if schedule should stop to speed exit
@@ -486,7 +487,10 @@ retry:
if labelMgr == nil {
continue
}
if labelMgr.ScheduleDisabled(region) {

// If the evict-leader-scheduler is also disabled by the deny label, it will obstruct the graceful restart of TiKV performed by TiDB Operator.
// Refer: https://docs.pingcap.com/tidb-in-kubernetes/stable/restart-a-tidb-cluster#perform-a-graceful-restart-to-a-single-tikv-pod
if labelMgr.ScheduleDisabled(region) && !isEvictLeaderScheduler {
denySchedulersByLabelerCounter.Inc()
continue retry
}
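The guard relies on Go's comma-ok type assertion: the new first line of Schedule checks once whether the underlying scheduler is the evict-leader one, and the deny-label branch then spares it. A standalone sketch of the pattern, with stand-in types rather than the pd ones:

	package main

	import "fmt"

	// Scheduler stands in for pd's scheduler interface.
	type Scheduler interface{ Name() string }

	type evictLeaderScheduler struct{}

	func (*evictLeaderScheduler) Name() string { return "evict-leader-scheduler" }

	type balanceLeaderScheduler struct{}

	func (*balanceLeaderScheduler) Name() string { return "balance-leader-scheduler" }

	func main() {
		for _, s := range []Scheduler{&evictLeaderScheduler{}, &balanceLeaderScheduler{}} {
			// Comma-ok type assertion: the bool reports whether the
			// concrete type behind the interface matches.
			_, isEvictLeader := s.(*evictLeaderScheduler)
			fmt.Printf("%s exempt from schedule deny label: %v\n", s.Name(), isEvictLeader)
		}
	}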
3 changes: 3 additions & 0 deletions server/cluster/cluster.go
@@ -1207,6 +1207,9 @@
return errs.ErrInvalidStoreID.FastGenByArgs(storeID)
}
newStore := typeutil.DeepClone(store.GetMeta(), core.StoreFactory)
if len(newStore.GetLabels()) == 0 {
return errors.Errorf("the label key %s does not exist", labelKey)
}
Member: Should we check it before cloning?

okJiang (Member, Author), Jun 24, 2024: Clone store before L1223 (newStore.Labels = labels)?

labels := make([]*metapb.StoreLabel, 0, len(newStore.GetLabels())-1)
for _, label := range newStore.GetLabels() {
if label.Key == labelKey {
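For reference, the reviewer's suggestion amounts to moving the emptiness check ahead of the deep copy, roughly like this (a sketch against the surrounding code, not the merged result):

	// Checking before cloning avoids a useless DeepClone when the store
	// carries no labels at all (the key cannot exist in that case).
	if len(store.GetMeta().GetLabels()) == 0 {
		return errors.Errorf("the label key %s does not exist", labelKey)
	}
	newStore := typeutil.DeepClone(store.GetMeta(), core.StoreFactory)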
10 changes: 9 additions & 1 deletion tests/integrations/realcluster/Makefile
@@ -48,9 +48,17 @@ kill_cluster:
echo "waiting for cluster to exit..."; \
sleep 30; \
fi
@ rm -rf ~/.tiup/data/pd_real_cluster_test

test:
CGO_ENABLED=1 go test ./... -v -tags deadlock -race -cover || { exit 1; }
CGO_ENABLED=1 go test ./... -v -tags deadlock -race -cover || (\
echo "following is the pd-0 log\n" ; \
cat ~/.tiup/data/pd_real_cluster_test/pd-0/pd.log ; \
echo "following is the pd-1 log\n" ; \
cat ~/.tiup/data/pd_real_cluster_test/pd-1/pd.log ; \
echo "following is the pd-2 log\n" ; \
cat ~/.tiup/data/pd_real_cluster_test/pd-2/pd.log ; \
exit 1)

install-tools:
cd $(ROOT_PATH) && $(MAKE) install-tools
10 changes: 7 additions & 3 deletions tests/integrations/realcluster/deploy.sh
@@ -1,6 +1,8 @@
#!/bin/bash
# deploy `tiup playground`

set -x

TIUP_BIN_DIR=$HOME/.tiup/bin/tiup
CUR_PATH=$(pwd)

@@ -19,13 +21,15 @@ if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] &&
color-green "downloading binaries..."
color-green "this may take a few minutes, you can also download them manually and put them in the bin directory."
make pd-server WITH_RACE=1
$TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_test \
--pd.binpath ./bin/pd-server \
$TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_real_cluster_test \
--pd.binpath ./bin/pd-server --pd.config ./tests/integrations/realcluster/pd.toml \
> $CUR_PATH/playground.log 2>&1 &
else
color-green "using existing binaries..."
make pd-server WITH_RACE=1
HuSharp (Member), Jun 24, 2024: Please remove this line, as the comment above explains that there is no need to make pd :) It may reduce test time.
$TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor \
--pd.binpath ./bin/pd-server --kv.binpath ./bin/tikv-server --db.binpath ./bin/tidb-server --tiflash.binpath ./bin/tiflash --tag pd_test \
--pd.binpath ./bin/pd-server --kv.binpath ./bin/tikv-server --db.binpath ./bin/tidb-server \
--tiflash.binpath ./bin/tiflash --tag pd_real_cluster_test --pd.config ./tests/integrations/realcluster/pd.toml \
> $CUR_PATH/playground.log 2>&1 &
fi

5 changes: 5 additions & 0 deletions tests/integrations/realcluster/pd.toml
@@ -0,0 +1,5 @@
[schedule]
patrol-region-interval = "100ms"

[log]
level = "debug"
98 changes: 46 additions & 52 deletions tests/integrations/realcluster/reboot_pd_test.go
@@ -14,61 +14,55 @@

package realcluster

Removed:

import (
	"context"
	"os/exec"
	"testing"

	"github.com/pingcap/log"
	"github.com/stretchr/testify/require"
)

func restartTiUP() {
	log.Info("start to restart TiUP")
	cmd := exec.Command("make", "deploy")
	err := cmd.Run()
	if err != nil {
		panic(err)
	}
	log.Info("TiUP restart success")
}

// https://github.com/tikv/pd/issues/6467
func TestReloadLabel(t *testing.T) {
	re := require.New(t)
	ctx := context.Background()

	resp, _ := pdHTTPCli.GetStores(ctx)
	setStore := resp.Stores[0]
	// TiFlash labels will be ["engine": "tiflash"]
	storeLabel := map[string]string{
		"zone": "zone1",
	}
	for _, label := range setStore.Store.Labels {
		storeLabel[label.Key] = label.Value
	}
	err := pdHTTPCli.SetStoreLabels(ctx, setStore.Store.ID, storeLabel)
	re.NoError(err)

	resp, err = pdHTTPCli.GetStores(ctx)
	re.NoError(err)
	for _, store := range resp.Stores {
		if store.Store.ID == setStore.Store.ID {
			for _, label := range store.Store.Labels {
				re.Equal(label.Value, storeLabel[label.Key])
			}
		}
	}

	restartTiUP()

	resp, err = pdHTTPCli.GetStores(ctx)
	re.NoError(err)
	for _, store := range resp.Stores {
		if store.Store.ID == setStore.Store.ID {
			for _, label := range store.Store.Labels {
				re.Equal(label.Value, storeLabel[label.Key])
			}
		}
	}
}

Added (the helper and the test are kept only as commented-out code, now with a deferred DeleteStoreLabel cleanup):

// func restartTiUP() {
// 	log.Info("start to restart TiUP")
// 	cmd := exec.Command("make", "deploy")
// 	err := cmd.Run()
// 	if err != nil {
// 		panic(err)
// 	}
// 	log.Info("TiUP restart success")
// }

// // https://github.com/tikv/pd/issues/6467
// func TestReloadLabel(t *testing.T) {
// 	re := require.New(t)
// 	ctx := context.Background()

// 	resp, _ := pdHTTPCli.GetStores(ctx)
// 	setStore := resp.Stores[0]
// 	// TiFlash labels will be ["engine": "tiflash"]
// 	storeLabel := map[string]string{
// 		"zone": "zone1",
// 	}
// 	for _, label := range setStore.Store.Labels {
// 		storeLabel[label.Key] = label.Value
// 	}

// 	re.NoError(pdHTTPCli.SetStoreLabels(ctx, setStore.Store.ID, storeLabel))
// 	defer func() {
// 		pdHTTPCli.DeleteStoreLabel(ctx, setStore.Store.ID, "zone")
// 	}()

// 	resp, err := pdHTTPCli.GetStores(ctx)
// 	re.NoError(err)
// 	for _, store := range resp.Stores {
// 		if store.Store.ID == setStore.Store.ID {
// 			for _, label := range store.Store.Labels {
// 				re.Equal(label.Value, storeLabel[label.Key])
// 			}
// 		}
// 	}

// 	restartTiUP()

// 	resp, err = pdHTTPCli.GetStores(ctx)
// 	re.NoError(err)
// 	for _, store := range resp.Stores {
// 		if store.Store.ID == setStore.Store.ID {
// 			for _, label := range store.Store.Labels {
// 				re.Equal(label.Value, storeLabel[label.Key])
// 			}
// 		}
// 	}
// }

Review thread on this file:

Member: How about using the build flag enable_flaky_tests?

okJiang (Member, Author): I'm not quite sure how enable_flaky_tests works. I reverted the changes to this file first because the test is just not working, not because it's unstable. ad19424

Member: It's not a flaky test, can be fixed by #8303 (comment)

okJiang (Member, Author): I fixed it. fa47700

Member: got