Skip to content

Commit 43d5189

Browse files
authored
Images: Fix "database is locked" errors when deleting old image records after update (#15716)
Fixes this intermittent test failure by leveraging the existing query retry logic provided by `s.DB.Cluster.Transaction()` function. ``` time="2025-06-03T21:37:57Z" level=error msg="Error deleting old image from database" ID=5 err="database is locked" fingerprint=119e2f4d376a0d9c3c3576bc6ef6e77e8e4146c6b1edf2f0b8dded3354cfed57 ``` Also makes some clean up to the image distribution functions to reduce the amount of transactions and queries they do. There's more that can be done though.
2 parents a5b26ba + 5c9d090 commit 43d5189

1 file changed

Lines changed: 65 additions & 74 deletions

File tree

lxd/images.go

Lines changed: 65 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,9 +1889,7 @@ func autoUpdateImagesTask(stateFunc func() *state.State) (task.Func, task.Schedu
18891889
func autoUpdateImages(ctx context.Context, s *state.State) error {
18901890
imageMap := make(map[string][]dbCluster.Image)
18911891

1892-
err := s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
1893-
var err error
1894-
1892+
err := s.DB.Cluster.Transaction(ctx, func(ctx context.Context, tx *db.ClusterTx) error {
18951893
autoUpdate := true
18961894
images, err := dbCluster.GetImages(ctx, tx.Tx(), dbCluster.ImageFilter{AutoUpdate: &autoUpdate})
18971895
if err != nil {
@@ -1909,72 +1907,59 @@ func autoUpdateImages(ctx context.Context, s *state.State) error {
19091907
}
19101908

19111909
for fingerprint, images := range imageMap {
1912-
skipFingerprint := false
1913-
1914-
var nodes []string
1910+
var nodes []db.NodeInfo
19151911

19161912
err = s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
1917-
nodes, err = tx.GetNodesWithImageAndAutoUpdate(ctx, fingerprint, true)
1913+
nodeAddresses, err := tx.GetNodesWithImageAndAutoUpdate(ctx, fingerprint, true)
1914+
if err != nil {
1915+
return fmt.Errorf("Failed getting cluster members with auto-update images: %w", err)
1916+
}
1917+
1918+
for _, nodeAddress := range nodeAddresses {
1919+
nodeInfo, err := tx.GetNodeByAddress(ctx, nodeAddress)
1920+
if err != nil {
1921+
return fmt.Errorf("Failed retrieving cluster member information for %q: %w", nodeAddress, err)
1922+
}
1923+
1924+
nodes = append(nodes, nodeInfo)
1925+
}
19181926

19191927
return err
19201928
})
19211929
if err != nil {
1922-
logger.Error("Error getting cluster members for image auto-update", logger.Ctx{"fingerprint": fingerprint, "err": err})
1930+
logger.Warn("Failed getting image auto-update info", logger.Ctx{"member": s.ServerName, "fingerprint": fingerprint, "err": err})
19231931
continue
19241932
}
19251933

19261934
if len(nodes) > 1 {
1927-
var nodeIDs []int64
1928-
1935+
nodeIDs := make([]int64, 0, len(nodes))
19291936
for _, node := range nodes {
1930-
err := s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
1931-
var err error
1932-
1933-
nodeInfo, err := tx.GetNodeByAddress(ctx, node)
1934-
if err != nil {
1935-
return err
1936-
}
1937-
1938-
nodeIDs = append(nodeIDs, nodeInfo.ID)
1939-
1940-
return nil
1941-
})
1942-
if err != nil {
1943-
logger.Error("Unable to retrieve cluster member information for image update", logger.Ctx{"err": err})
1944-
skipFingerprint = true
1945-
break
1946-
}
1937+
nodeIDs = append(nodeIDs, node.ID)
19471938
}
19481939

1949-
if skipFingerprint {
1940+
// If multiple nodes have the image, select one to deal with it.
1941+
selectedNode, err := util.GetStableRandomInt64FromList(int64(len(images)), nodeIDs)
1942+
if err != nil {
1943+
logger.Error("Failed to select cluster member for image update", logger.Ctx{"err": err})
19501944
continue
19511945
}
19521946

1953-
// If multiple nodes have the image, select one to deal with it.
1954-
if len(nodeIDs) > 1 {
1955-
selectedNode, err := util.GetStableRandomInt64FromList(int64(len(images)), nodeIDs)
1956-
if err != nil {
1957-
logger.Error("Failed to select cluster member for image update", logger.Ctx{"err": err})
1958-
continue
1959-
}
1960-
1961-
// Skip image update if we're not the chosen cluster member.
1962-
// That way, an image is only updated by a single cluster member.
1963-
if s.DB.Cluster.GetNodeID() != selectedNode {
1964-
continue
1965-
}
1947+
// Skip image update if we're not the chosen cluster member.
1948+
// That way, an image is only updated by a single cluster member.
1949+
if s.DB.Cluster.GetNodeID() != selectedNode {
1950+
continue
19661951
}
19671952
}
19681953

19691954
var deleteIDs []int
19701955
var newImage *api.Image
19711956

19721957
for _, image := range images {
1973-
imgProject := image.Project
1974-
filter := dbCluster.ImageFilter{Project: &imgProject}
1958+
l := logger.AddContext(logger.Ctx{"member": s.ServerName, "project": image.Project, "fingerprint": image.Fingerprint})
1959+
1960+
filter := dbCluster.ImageFilter{Project: &image.Project}
19751961
if image.Public {
1976-
imgPublic := image.Public
1977-
filter.Public = &imgPublic
1962+
filter.Public = &image.Public
19781963
}
19791964

19801965
var imageInfo *api.Image
@@ -1985,13 +1970,13 @@ func autoUpdateImages(ctx context.Context, s *state.State) error {
19851970
return err
19861971
})
19871972
if err != nil {
1988-
logger.Error("Failed to get image", logger.Ctx{"err": err, "project": image.Project, "fingerprint": image.Fingerprint})
1973+
l.Error("Failed to get image", logger.Ctx{"err": err})
19891974
continue
19901975
}
19911976

19921977
newInfo, err := autoUpdateImage(ctx, s, nil, image.ID, imageInfo, image.Project, false)
19931978
if err != nil {
1994-
logger.Error("Failed to update image", logger.Ctx{"err": err, "project": image.Project, "fingerprint": image.Fingerprint})
1979+
l.Error("Failed to update image", logger.Ctx{"err": err})
19951980

19961981
if err == context.Canceled {
19971982
return nil
@@ -2011,33 +1996,36 @@ func autoUpdateImages(ctx context.Context, s *state.State) error {
20111996
if len(nodes) > 1 {
20121997
err := distributeImage(ctx, s, nodes, fingerprint, newImage)
20131998
if err != nil {
2014-
logger.Error("Failed to distribute new image", logger.Ctx{"err": err, "fingerprint": newImage.Fingerprint})
1999+
logger.Error("Failed to distribute new image", logger.Ctx{"member": s.ServerName, "fingerprint": newImage.Fingerprint, "err": err})
20152000

20162001
if err == context.Canceled {
20172002
return nil
20182003
}
20192004
}
20202005
}
20212006

2022-
_ = s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
2007+
err = s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
20232008
for _, ID := range deleteIDs {
20242009
// Remove the database entry for the image after distributing to cluster members.
20252010
err := tx.DeleteImage(ctx, ID)
20262011
if err != nil {
2027-
logger.Error("Error deleting old image from database", logger.Ctx{"err": err, "fingerprint": fingerprint, "ID": ID})
2012+
return fmt.Errorf(`Failed deleting image record with ID "%d": %w`, ID, err)
20282013
}
20292014
}
20302015

20312016
return nil
20322017
})
2018+
if err != nil {
2019+
logger.Error("Error deleting old image(s) records", logger.Ctx{"member": s.ServerName, "fingerprint": fingerprint, "err": err})
2020+
}
20332021
}
20342022
}
20352023

20362024
return nil
20372025
}
20382026

2039-
func distributeImage(ctx context.Context, s *state.State, nodes []string, oldFingerprint string, newImage *api.Image) error {
2040-
logger.Info("Distributing image to members", logger.Ctx{"fingerprint": newImage.Fingerprint, "member": s.ServerName, "targets": nodes})
2027+
func distributeImage(ctx context.Context, s *state.State, nodes []db.NodeInfo, oldFingerprint string, newImage *api.Image) error {
2028+
logger.Info("Distributing image to members", logger.Ctx{"fingerprint": newImage.Fingerprint, "member": s.ServerName, "remotes": len(nodes)})
20412029

20422030
// Get config of all nodes (incl. own) and check for storage.images_volume.
20432031
// If the setting is missing, distribute the image to the node.
@@ -2113,31 +2101,22 @@ func distributeImage(ctx context.Context, s *state.State, nodes []string, oldFin
21132101
return err
21142102
}
21152103

2116-
for _, nodeAddress := range nodes {
2117-
if nodeAddress == localClusterAddress {
2104+
for _, node := range nodes {
2105+
if node.Address == localClusterAddress {
21182106
continue
21192107
}
21202108

2121-
var nodeInfo db.NodeInfo
2122-
err = s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
2123-
nodeInfo, err = tx.GetNodeByAddress(ctx, nodeAddress)
2124-
return err
2125-
})
2126-
if err != nil {
2127-
return fmt.Errorf("Failed to retrieve information about cluster member with address %q: %w", nodeAddress, err)
2128-
}
2129-
21302109
err = func() error {
2131-
client, err := cluster.Connect(nodeAddress, s.Endpoints.NetworkCert(), s.ServerCert(), nil, true)
2110+
client, err := cluster.Connect(node.Address, s.Endpoints.NetworkCert(), s.ServerCert(), nil, true)
21322111
if err != nil {
2133-
return fmt.Errorf("Failed to connect to %q for image synchronization: %w", nodeAddress, err)
2112+
return fmt.Errorf("Failed to connect to %q for image synchronization: %w", node.Address, err)
21342113
}
21352114

2136-
client = client.UseTarget(nodeInfo.Name)
2115+
client = client.UseTarget(node.Name)
21372116

21382117
resp, _, err := client.GetServer()
21392118
if err != nil {
2140-
logger.Error("Failed to retrieve information about cluster member", logger.Ctx{"err": err, "remote": nodeAddress})
2119+
logger.Error("Failed to retrieve information about cluster member", logger.Ctx{"err": err, "remote": node.Address})
21412120
} else {
21422121
vol := ""
21432122

@@ -2204,7 +2183,7 @@ func distributeImage(ctx context.Context, s *state.State, nodes []string, oldFin
22042183
Filename: createArgs.MetaName,
22052184
}
22062185

2207-
logger.Info("Distributing image to member", logger.Ctx{"member": s.ServerName, "target": nodeInfo.Name, "fingerprint": newImage.Fingerprint})
2186+
logger.Info("Distributing image to member", logger.Ctx{"member": s.ServerName, "remote": node.Name, "fingerprint": newImage.Fingerprint})
22082187
op, err := client.CreateImage(image, createArgs)
22092188
if err != nil {
22102189
return err
@@ -2233,19 +2212,19 @@ func distributeImage(ctx context.Context, s *state.State, nodes []string, oldFin
22332212

22342213
_, _, err = client.RawQuery(http.MethodPost, "/internal/image-optimize", req, "")
22352214
if err != nil {
2236-
logger.Error("Failed creating new image in storage pool", logger.Ctx{"err": err, "remote": nodeAddress, "pool": poolName, "fingerprint": newImage.Fingerprint})
2215+
logger.Error("Failed creating new image in storage pool", logger.Ctx{"err": err, "remote": node.Address, "pool": poolName, "fingerprint": newImage.Fingerprint})
22372216
}
22382217

22392218
err = client.DeleteStoragePoolVolume(poolName, "image", oldFingerprint)
22402219
if err != nil {
2241-
logger.Error("Failed deleting old image from storage pool", logger.Ctx{"err": err, "remote": nodeAddress, "pool": poolName, "fingerprint": oldFingerprint})
2220+
logger.Error("Failed deleting old image from storage pool", logger.Ctx{"err": err, "remote": node.Address, "pool": poolName, "fingerprint": oldFingerprint})
22422221
}
22432222
}
22442223

22452224
return nil
22462225
}()
22472226
if err != nil {
2248-
return fmt.Errorf("Failed distributing image %q to %q: %w", newImage.Fingerprint, nodeInfo.Name, err)
2227+
return fmt.Errorf("Failed distributing image %q to %q: %w", newImage.Fingerprint, node.Name, err)
22492228
}
22502229
}
22512230

@@ -4668,12 +4647,24 @@ func imageRefresh(d *Daemon, r *http.Request) response.Response {
46684647

46694648
// Begin background operation
46704649
run := func(op *operations.Operation) error {
4671-
var nodes []string
4650+
var nodes []db.NodeInfo
46724651

46734652
err = s.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error {
4674-
nodes, err = tx.GetNodesWithImageAndAutoUpdate(ctx, details.imageFingerprintPrefix, true)
4653+
nodeAddresses, err := tx.GetNodesWithImageAndAutoUpdate(ctx, details.imageFingerprintPrefix, true)
4654+
if err != nil {
4655+
return fmt.Errorf("Failed getting cluster members with auto-update images: %w", err)
4656+
}
46754657

4676-
return err
4658+
for _, nodeAddress := range nodeAddresses {
4659+
nodeInfo, err := tx.GetNodeByAddress(ctx, nodeAddress)
4660+
if err != nil {
4661+
return fmt.Errorf("Failed retrieving cluster member information for %q: %w", nodeAddress, err)
4662+
}
4663+
4664+
nodes = append(nodes, nodeInfo)
4665+
}
4666+
4667+
return nil
46774668
})
46784669
if err != nil {
46794670
return fmt.Errorf("Error getting cluster members for refreshing image %q in project %q: %w", details.imageFingerprintPrefix, projectName, err)

0 commit comments

Comments
 (0)