path: root/weed/wdclient/filer_client.go
blob: f0dd5f2e6ebfe21528c213fc26b989acf29666ff
package wdclient

import (
	"context"
	"fmt"
	"math/rand"
	"strings"
	"sync/atomic"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
)

// UrlPreference controls which URL to use for volume access
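// PreferUrl uses the volume server's internal address; PreferPublicUrl is typically chosen
// when clients must reach volume servers through their externally advertised publicUrl
// (which one is appropriate depends on the deployment).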
type UrlPreference string

const (
	PreferUrl       UrlPreference = "url"       // Use private URL (default)
	PreferPublicUrl UrlPreference = "publicUrl" // Use public URL
)

// filerHealth tracks the health status of a filer
type filerHealth struct {
	failureCount      int32 // atomic: consecutive failures
	lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
}

// FilerClient provides volume location services by querying a filer
// It uses the shared vidMap cache for efficient lookups
// Supports multiple filer addresses with automatic failover for high availability
// Tracks filer health to avoid repeatedly trying known-unhealthy filers
type FilerClient struct {
	*vidMapClient
	filerAddresses     []pb.ServerAddress
	filerIndex         int32          // atomic: current filer index for round-robin
	filerHealth        []*filerHealth // health status per filer (same order as filerAddresses)
	grpcDialOption     grpc.DialOption
	urlPreference      UrlPreference
	grpcTimeout        time.Duration
	cacheSize          int           // Number of historical vidMap snapshots to keep
	clientId           int32         // Unique client identifier for gRPC metadata
	failureThreshold   int32         // Circuit breaker: consecutive failures before circuit opens
	resetTimeout       time.Duration // Circuit breaker: time before re-checking unhealthy filer
	maxRetries         int           // Retry: maximum retry attempts for transient failures
	initialRetryWait   time.Duration // Retry: initial wait time before first retry
	retryBackoffFactor float64       // Retry: backoff multiplier for wait time
}

// filerVolumeProvider implements VolumeLocationProvider by querying filer
// Supports multiple filer addresses with automatic failover
type filerVolumeProvider struct {
	filerClient *FilerClient
}

// FilerClientOption holds optional configuration for FilerClient
type FilerClientOption struct {
	GrpcTimeout        time.Duration
	UrlPreference      UrlPreference
	CacheSize          int           // Number of historical vidMap snapshots (0 = use default)
	FailureThreshold   int32         // Circuit breaker: consecutive failures before skipping filer (0 = use default of 3)
	ResetTimeout       time.Duration // Circuit breaker: time before re-checking unhealthy filer (0 = use default of 30s)
	MaxRetries         int           // Retry: maximum retry attempts for transient failures (0 = use default of 3)
	InitialRetryWait   time.Duration // Retry: initial wait time before first retry (0 = use default of 1s)
	RetryBackoffFactor float64       // Retry: backoff multiplier for wait time (0 = use default of 1.5)
}

// NewFilerClient creates a new client that queries filer(s) for volume locations
// Supports multiple filer addresses for high availability with automatic failover
// Uses sensible defaults: 5-second gRPC timeout, PreferUrl, DefaultVidMapCacheSize
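//
// A minimal usage sketch (the addresses and option values are illustrative, not required defaults):
//
//	fc := NewFilerClient(
//		[]pb.ServerAddress{"filer1:8888", "filer2:8888"},
//		grpcDialOption, // a grpc.DialOption prepared by the caller
//		"dc1",
//		&FilerClientOption{GrpcTimeout: 10 * time.Second, MaxRetries: 5},
//	)
//
// The opts argument may be omitted entirely to keep all defaults.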
func NewFilerClient(filerAddresses []pb.ServerAddress, grpcDialOption grpc.DialOption, dataCenter string, opts ...*FilerClientOption) *FilerClient {
	if len(filerAddresses) == 0 {
		glog.Fatal("NewFilerClient requires at least one filer address")
	}

	// Apply defaults
	grpcTimeout := 5 * time.Second
	urlPref := PreferUrl
	cacheSize := DefaultVidMapCacheSize
	failureThreshold := int32(3)     // Default: 3 consecutive failures before circuit opens
	resetTimeout := 30 * time.Second // Default: 30 seconds before re-checking unhealthy filer
	maxRetries := 3                  // Default: 3 retry attempts for transient failures
	initialRetryWait := time.Second  // Default: 1 second initial retry wait
	retryBackoffFactor := 1.5        // Default: 1.5x backoff multiplier

	// Override with provided options
	if len(opts) > 0 && opts[0] != nil {
		opt := opts[0]
		if opt.GrpcTimeout > 0 {
			grpcTimeout = opt.GrpcTimeout
		}
		if opt.UrlPreference != "" {
			urlPref = opt.UrlPreference
		}
		if opt.CacheSize > 0 {
			cacheSize = opt.CacheSize
		}
		if opt.FailureThreshold > 0 {
			failureThreshold = opt.FailureThreshold
		}
		if opt.ResetTimeout > 0 {
			resetTimeout = opt.ResetTimeout
		}
		if opt.MaxRetries > 0 {
			maxRetries = opt.MaxRetries
		}
		if opt.InitialRetryWait > 0 {
			initialRetryWait = opt.InitialRetryWait
		}
		if opt.RetryBackoffFactor > 0 {
			retryBackoffFactor = opt.RetryBackoffFactor
		}
	}

	// Initialize health tracking for each filer
	health := make([]*filerHealth, len(filerAddresses))
	for i := range health {
		health[i] = &filerHealth{}
	}

	fc := &FilerClient{
		filerAddresses:     filerAddresses,
		filerIndex:         0,
		filerHealth:        health,
		grpcDialOption:     grpcDialOption,
		urlPreference:      urlPref,
		grpcTimeout:        grpcTimeout,
		cacheSize:          cacheSize,
		clientId:           rand.Int31(), // Random client ID for gRPC metadata tracking
		failureThreshold:   failureThreshold,
		resetTimeout:       resetTimeout,
		maxRetries:         maxRetries,
		initialRetryWait:   initialRetryWait,
		retryBackoffFactor: retryBackoffFactor,
	}

	// Create provider that references this FilerClient for failover support
	provider := &filerVolumeProvider{
		filerClient: fc,
	}

	fc.vidMapClient = newVidMapClient(provider, dataCenter, cacheSize)

	return fc
}

// GetLookupFileIdFunction returns a lookup function with URL preference handling
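//
// A fileId is expected to look like "<volumeId>,<fileKey>", e.g. "3,01637037d6" (an
// illustrative value); the volume id before the comma is what drives the location lookup.
// A minimal sketch of calling the returned function:
//
//	lookup := fc.GetLookupFileIdFunction()
//	urls, err := lookup(ctx, "3,01637037d6")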
func (fc *FilerClient) GetLookupFileIdFunction() LookupFileIdFunctionType {
	if fc.urlPreference == PreferUrl {
		// Use the default implementation from vidMapClient
		return fc.vidMapClient.GetLookupFileIdFunction()
	}

	// Custom implementation that prefers PublicUrl
	return func(ctx context.Context, fileId string) (fullUrls []string, err error) {
		// Parse file ID to extract volume ID
		parts := strings.Split(fileId, ",")
		if len(parts) != 2 {
			return nil, fmt.Errorf("invalid fileId format: %s", fileId)
		}
		volumeIdStr := parts[0]

		// First try the cache using LookupVolumeIdsWithFallback
		vidLocations, err := fc.LookupVolumeIdsWithFallback(ctx, []string{volumeIdStr})

		// Check for partial results first (important for multi-volume batched lookups)
		locations, found := vidLocations[volumeIdStr]
		if !found || len(locations) == 0 {
			// Volume not found - return specific error with context from lookup if available
			if err != nil {
				return nil, fmt.Errorf("volume %s not found for fileId %s: %w", volumeIdStr, fileId, err)
			}
			return nil, fmt.Errorf("volume %s not found for fileId %s", volumeIdStr, fileId)
		}

		// Volume found successfully - ignore any errors about other volumes
		// (not relevant for single-volume lookup, but defensive for future batching)

		// Build URLs with publicUrl preference, and also prefer same DC
		var sameDcUrls, otherDcUrls []string
		dataCenter := fc.GetDataCenter()
		for _, loc := range locations {
			url := loc.PublicUrl
			if url == "" {
				url = loc.Url
			}
			httpUrl := "http://" + url + "/" + fileId
			if dataCenter != "" && dataCenter == loc.DataCenter {
				sameDcUrls = append(sameDcUrls, httpUrl)
			} else {
				otherDcUrls = append(otherDcUrls, httpUrl)
			}
		}
		// Shuffle to distribute load across volume servers
		rand.Shuffle(len(sameDcUrls), func(i, j int) { sameDcUrls[i], sameDcUrls[j] = sameDcUrls[j], sameDcUrls[i] })
		rand.Shuffle(len(otherDcUrls), func(i, j int) { otherDcUrls[i], otherDcUrls[j] = otherDcUrls[j], otherDcUrls[i] })
		// Prefer same data center
		fullUrls = append(sameDcUrls, otherDcUrls...)
		return fullUrls, nil
	}
}

// isRetryableGrpcError checks if a gRPC error is transient and should be retried
//
// Note on codes.Aborted: While Aborted can indicate application-level conflicts
// (e.g., transaction failures), in the context of volume location lookups (which
// are simple read-only operations with no transactions), Aborted is more likely
// to indicate transient server issues during restart/recovery. We include it here
// for volume lookups but log it for visibility in case misclassification occurs.
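// For instance, codes.Unavailable returned while a filer restarts is treated as retryable,
// whereas codes.NotFound or codes.PermissionDenied is not and fails the lookup immediately.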
func isRetryableGrpcError(err error) bool {
	if err == nil {
		return false
	}

	// Check gRPC status code
	st, ok := status.FromError(err)
	if ok {
		switch st.Code() {
		case codes.Unavailable: // Server unavailable (temporary)
			return true
		case codes.DeadlineExceeded: // Request timeout
			return true
		case codes.ResourceExhausted: // Rate limited or overloaded
			return true
		case codes.Aborted:
			// Aborted during read-only volume lookups is likely transient
			// (e.g., filer restarting), but log for visibility
			glog.V(1).Infof("Treating Aborted as retryable for volume lookup: %v", err)
			return true
		}
	}

	// Fallback to string matching for non-gRPC errors (e.g., network errors)
	errStr := err.Error()
	return strings.Contains(errStr, "transport") ||
		strings.Contains(errStr, "connection") ||
		strings.Contains(errStr, "timeout") ||
		strings.Contains(errStr, "unavailable")
}

// shouldSkipUnhealthyFiler checks if we should skip a filer based on recent failures
// Circuit breaker pattern: skip filers with multiple recent consecutive failures
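// For example, with the defaults (failureThreshold=3, resetTimeout=30s) a filer is skipped
// after its 3rd consecutive failure and is tried again 30 seconds after its most recent failure.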
func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
	health := fc.filerHealth[index]
	failureCount := atomic.LoadInt32(&health.failureCount)

	// Check if failure count exceeds threshold
	if failureCount < fc.failureThreshold {
		return false
	}

	// Re-check unhealthy filers after reset timeout
	lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
	if lastFailureNs == 0 {
		return false // Never failed, shouldn't skip
	}
	lastFailureTime := time.Unix(0, lastFailureNs)
	if time.Since(lastFailureTime) > fc.resetTimeout {
		return false // Time to re-check
	}

	return true // Skip this unhealthy filer
}

// recordFilerSuccess resets failure tracking for a successful filer
func (fc *FilerClient) recordFilerSuccess(index int32) {
	health := fc.filerHealth[index]
	atomic.StoreInt32(&health.failureCount, 0)
}

// recordFilerFailure increments failure count for an unhealthy filer
func (fc *FilerClient) recordFilerFailure(index int32) {
	health := fc.filerHealth[index]
	atomic.AddInt32(&health.failureCount, 1)
	atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())
}

// LookupVolumeIds queries the filer for volume locations with automatic failover
// Tries all configured filer addresses until one succeeds (high availability)
// Retries transient gRPC errors (Unavailable, DeadlineExceeded, etc.) with exponential backoff
// Note: Unlike the master's VolumeIdLocation, the filer's Locations message doesn't currently
// have an Error field. This implementation handles the current structure while staying ready
// for future error-reporting enhancements.
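// With the default retry settings (maxRetries=3, initialRetryWait=1s, retryBackoffFactor=1.5),
// a cluster whose filers keep returning retryable errors is retried after waits of roughly
// 1s and then 1.5s before the final error is returned.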
func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []string) (map[string][]Location, error) {
	fc := p.filerClient
	result := make(map[string][]Location)

	// Retry transient failures with configurable backoff
	var lastErr error
	waitTime := fc.initialRetryWait
	maxRetries := fc.maxRetries

	for retry := 0; retry < maxRetries; retry++ {
		// Try all filer addresses with round-robin starting from current index
		// Skip known-unhealthy filers (circuit breaker pattern)
		i := atomic.LoadInt32(&fc.filerIndex)
		n := int32(len(fc.filerAddresses))

		for x := int32(0); x < n; x++ {
			// Circuit breaker: skip unhealthy filers
			if fc.shouldSkipUnhealthyFiler(i) {
				glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
					fc.filerAddresses[i], atomic.LoadInt32(&fc.filerHealth[i].failureCount))
				i++
				if i >= n {
					i = 0
				}
				continue
			}

			filerAddress := fc.filerAddresses[i]

			// Wrap in an anonymous function so defer cancel() runs at the end of each filer attempt instead of accumulating until LookupVolumeIds returns
			err := func() error {
				// Create a fresh timeout context for each filer attempt
				// This ensures each retry gets the full grpcTimeout, not a diminishing deadline
				timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
				defer cancel() // Always clean up context, even on panic or early return

				return pb.WithGrpcFilerClient(false, fc.clientId, filerAddress, fc.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
					resp, err := client.LookupVolume(timeoutCtx, &filer_pb.LookupVolumeRequest{
						VolumeIds: volumeIds,
					})
					if err != nil {
						return fmt.Errorf("filer.LookupVolume failed: %w", err)
					}

					// Process each volume in the response
					for vid, locs := range resp.LocationsMap {
						// Convert locations from protobuf to internal format
						var locations []Location
						for _, loc := range locs.Locations {
							locations = append(locations, Location{
								Url:        loc.Url,
								PublicUrl:  loc.PublicUrl,
								DataCenter: loc.DataCenter,
								GrpcPort:   int(loc.GrpcPort),
							})
						}

						// Only add to result if we have locations
						// Empty locations with no gRPC error means "not found" (volume doesn't exist)
						if len(locations) > 0 {
							result[vid] = locations
							glog.V(4).Infof("FilerClient: volume %s found with %d location(s)", vid, len(locations))
						} else {
							glog.V(2).Infof("FilerClient: volume %s not found (no locations in response)", vid)
						}
					}

					// Check for volumes that weren't in the response at all
					// This could indicate a problem with the filer
					for _, vid := range volumeIds {
						if _, found := resp.LocationsMap[vid]; !found {
							glog.V(1).Infof("FilerClient: volume %s missing from filer response", vid)
						}
					}

					return nil
				})
			}()

			if err != nil {
				glog.V(1).Infof("FilerClient: filer %s lookup failed (attempt %d/%d, retry %d/%d): %v", filerAddress, x+1, n, retry+1, maxRetries, err)
				fc.recordFilerFailure(i)
				lastErr = err
				i++
				if i >= n {
					i = 0
				}
				continue
			}

			// Success - update the preferred filer index and reset health tracking
			atomic.StoreInt32(&fc.filerIndex, i)
			fc.recordFilerSuccess(i)
			glog.V(3).Infof("FilerClient: looked up %d volumes on %s, found %d", len(volumeIds), filerAddress, len(result))
			return result, nil
		}

		// All filers failed on this attempt
		// Check if the error is retryable (transient gRPC error)
		if !isRetryableGrpcError(lastErr) {
			// Non-retryable error (e.g., NotFound, PermissionDenied) - fail immediately
			return nil, fmt.Errorf("all %d filer(s) failed with non-retryable error: %w", n, lastErr)
		}

		// Transient error - retry if we have attempts left
		if retry < maxRetries-1 {
			glog.V(1).Infof("FilerClient: all %d filer(s) failed with retryable error (attempt %d/%d), retrying in %v: %v",
				n, retry+1, maxRetries, waitTime, lastErr)
			time.Sleep(waitTime)
			waitTime = time.Duration(float64(waitTime) * fc.retryBackoffFactor)
		}
	}

	// All retries exhausted
	return nil, fmt.Errorf("all %d filer(s) failed after %d attempts, last error: %w", len(fc.filerAddresses), maxRetries, lastErr)
}