1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
|
package ml
import (
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
)
// AccessPattern classifies the way a file is being read.
type AccessPattern int

// Known access patterns. Values are assigned by iota in declaration order,
// starting with RandomAccess at zero.
const (
	// RandomAccess is the zero value: no recognisable structure.
	RandomAccess AccessPattern = iota
	// SequentialAccess marks reads that follow one another through a file.
	SequentialAccess
	// StridedAccess marks a fixed stride between accesses (common in image datasets).
	StridedAccess
	// BatchAccess marks multiple files accessed together.
	BatchAccess
	// EpochAccess marks dataset restart patterns (ML training).
	EpochAccess
	// ModelAccess marks large model checkpoint loading.
	ModelAccess
)

// String returns the short display name of the pattern, or "Unknown" for any
// value outside the declared constants.
func (ap AccessPattern) String() string {
	names := [...]string{
		RandomAccess:     "Random",
		SequentialAccess: "Sequential",
		StridedAccess:    "Strided",
		BatchAccess:      "Batch",
		EpochAccess:      "Epoch",
		ModelAccess:      "Model",
	}
	if ap < 0 || int(ap) >= len(names) {
		return "Unknown"
	}
	return names[ap]
}
// AccessEvent is one recorded read against a file. RecordAccess builds one per
// call and hands it to addToHistory, which appends it to the detector's global
// history.
type AccessEvent struct {
// Timestamp is the time at which the access was recorded.
Timestamp time.Time
// Inode identifies the file that was read.
Inode uint64
// Offset is the starting position of the read within the file.
Offset int64
// Size is the length of the read (presumably bytes: RecordAccess adds the
// same value to AccessInfo.BytesRead).
Size int
// ReadType is a free-form label for the read.
// NOTE(review): RecordAccess in this file never populates it — confirm
// whether another writer exists.
ReadType string // "sequential", "random", etc.
}
// AccessInfo holds the per-file state the detector keeps for one inode: the
// previous read, running totals, and the current pattern classification.
type AccessInfo struct {
// Inode identifies the tracked file.
Inode uint64
// LastOffset is the offset of the previous read. RecordAccess initialises it
// to -1 for a new file, and detectPattern treats -1 as "no previous access".
LastOffset int64
// LastAccessTime is when the previous read was recorded.
LastAccessTime time.Time
// LastSize is the size of the previous read.
LastSize int
// ConsecutiveSeq is incremented by detectPattern for each read that starts
// within maxGapSize after the end of the previous one, and reset to zero
// by any other read.
ConsecutiveSeq int // Count of consecutive sequential reads
// TotalAccesses counts every RecordAccess call for this inode.
TotalAccesses int
// BytesRead is the sum of the sizes passed to RecordAccess for this inode.
BytesRead int64
// Pattern is the current classification of the file.
Pattern AccessPattern
Confidence float64 // Confidence in pattern detection (0.0-1.0)
// PrefetchSize is zero until a pattern that assigns a size has been detected.
PrefetchSize int64 // Recommended prefetch size
}
// AccessPatternDetector detects and analyzes file access patterns for ML workloads.
// The embedded RWMutex guards every field below; the exported methods in this
// file take it themselves, while the unexported helpers are called with it
// already held.
type AccessPatternDetector struct {
sync.RWMutex
// Configuration
// maxHistory caps the length of recentAccesses.
maxHistory int
sequentialThreshold int // Minimum consecutive reads to consider sequential
maxGapSize int64 // Maximum gap to still consider sequential
// NOTE(review): stridedMinRepeats is set by the constructor but not read by
// any code visible in this file.
stridedMinRepeats int // Minimum repeats to detect strided access
confidenceThreshold float64 // Minimum confidence to act on pattern
// Per-file tracking
// fileInfo maps inode -> tracking state; entries are created by RecordAccess
// and removed by CleanupOldEntries.
fileInfo map[uint64]*AccessInfo
// Global access history for cross-file pattern detection
// NOTE(review): the code visible here only appends to recentAccesses; no
// reader is shown.
recentAccesses []AccessEvent
// ML-specific heuristics
// enableMLHeuristics gates the call to detectMLPatterns in detectPattern.
enableMLHeuristics bool
// NOTE(review): the two extension sets are populated by the constructor but
// not consulted by any code visible in this file.
imageFileExtensions map[string]bool
modelFileExtensions map[string]bool
// Metrics
// totalAccesses counts RecordAccess calls across all files.
totalAccesses int64
// sequentialReads is incremented each time a file transitions into
// SequentialAccess (not once per sequential read).
sequentialReads int64
// randomReads is incremented for reads outside the sequential gap window,
// except the read that breaks an established sequential pattern.
randomReads int64
// NOTE(review): prefetchTriggered is reported by GetMetrics but never
// incremented in the code visible here.
prefetchTriggered int64
}
// NewAccessPatternDetector returns a detector preloaded with the default
// tuning for ML workloads: a 1000-entry global history, three consecutive
// reads to call a file sequential, a 64KB sequential gap allowance, a 0.6
// confidence threshold, ML heuristics enabled, and the built-in image and
// model file-extension sets.
func NewAccessPatternDetector() *AccessPatternDetector {
	const historyCapacity = 1000

	// extensionSet turns a list of extensions into a membership map.
	extensionSet := func(exts ...string) map[string]bool {
		set := make(map[string]bool, len(exts))
		for _, ext := range exts {
			set[ext] = true
		}
		return set
	}

	detector := new(AccessPatternDetector)

	// Tuning knobs.
	detector.maxHistory = historyCapacity
	detector.sequentialThreshold = 3
	detector.maxGapSize = 64 * 1024 // 64KB
	detector.stridedMinRepeats = 3
	detector.confidenceThreshold = 0.6

	// Tracking state.
	detector.fileInfo = make(map[uint64]*AccessInfo)
	detector.recentAccesses = make([]AccessEvent, 0, historyCapacity)

	// ML heuristics.
	detector.enableMLHeuristics = true
	detector.imageFileExtensions = extensionSet(
		"jpg", "jpeg", "png", "bmp", "tiff", "webp", "raw",
	)
	detector.modelFileExtensions = extensionSet(
		"pt", "pth", "pkl", "h5", "pb", "onnx", "tflite", "caffemodel",
	)

	return detector
}
// RecordAccess records one read of size bytes at offset against inode, updates
// the file's pattern classification, appends the event to the global history,
// and returns the file's resulting state.
//
// The returned *AccessInfo is a snapshot copied while the detector's lock is
// still held. The live tracking record stays private to the detector, so a
// caller can read the result without racing against concurrent RecordAccess
// calls for the same inode, and later calls do not change an earlier result.
func (apd *AccessPatternDetector) RecordAccess(inode uint64, offset int64, size int) *AccessInfo {
	apd.Lock()
	defer apd.Unlock()

	now := time.Now()
	apd.totalAccesses++

	// A zero-value detector has no map yet; writing to a nil map would panic.
	if apd.fileInfo == nil {
		apd.fileInfo = make(map[uint64]*AccessInfo)
	}

	// Get or create the per-file record. LastOffset == -1 marks "no previous
	// access" for detectPattern.
	info := apd.fileInfo[inode]
	if info == nil {
		info = &AccessInfo{
			Inode:        inode,
			LastOffset:   -1,
			Pattern:      RandomAccess,
			PrefetchSize: 0,
		}
		apd.fileInfo[inode] = info
	}

	// Update basic stats.
	info.TotalAccesses++
	info.BytesRead += int64(size)

	// Classify before overwriting the Last* fields: detectPattern compares the
	// new read against the previous one.
	apd.detectPattern(info, offset, size, now)

	// Record in global history for cross-file analysis.
	apd.addToHistory(AccessEvent{
		Timestamp: now,
		Inode:     inode,
		Offset:    offset,
		Size:      size,
	})

	// This read now becomes the "previous" one.
	info.LastAccessTime = now
	info.LastOffset = offset
	info.LastSize = size

	glog.V(4).Infof("Access pattern for inode %d: %s (confidence: %.2f, prefetch: %d)",
		inode, info.Pattern, info.Confidence, info.PrefetchSize)

	snapshot := *info
	return &snapshot
}
// detectPattern reclassifies info for a new read of size bytes at offset.
//
// It compares the read against the previous one stored in info
// (LastOffset/LastSize/LastAccessTime), so it must run before those fields are
// overwritten; RecordAccess calls it that way, with the detector's write lock
// held. It mutates info (Pattern, Confidence, PrefetchSize, ConsecutiveSeq) and
// the detector's sequentialReads/randomReads counters.
//
// Steps, in order:
//  1. First access (LastOffset == -1): Random at confidence 0.5, done.
//  2. Sequential window: a read starting 0..maxGapSize past the end of the
//     previous read extends the run; once the run reaches
//     sequentialThreshold the file becomes SequentialAccess.
//  3. Otherwise the run is reset; if the file was SequentialAccess it falls
//     back to Random and the function returns immediately.
//  4. Optional ML heuristics (detectMLPatterns) may override the pattern.
//  5. For files still SequentialAccess with more than 10 accesses, the
//     prefetch size is scaled by how recently the file was last read.
func (apd *AccessPatternDetector) detectPattern(info *AccessInfo, offset int64, size int, now time.Time) {
if info.LastOffset == -1 {
// First access
info.Pattern = RandomAccess
info.Confidence = 0.5
return
}
// Distance from the end of the previous read to the start of this one;
// negative when this read starts before the previous one ended.
gap := offset - (info.LastOffset + int64(info.LastSize))
// Sequential access detection
if gap >= 0 && gap <= apd.maxGapSize {
info.ConsecutiveSeq++
if info.ConsecutiveSeq >= apd.sequentialThreshold {
oldPattern := info.Pattern
info.Pattern = SequentialAccess
// Confidence grows by 0.1 per consecutive sequential read, capped at 1.0.
info.Confidence = minFloat(1.0, 0.1 + float64(info.ConsecutiveSeq) * 0.1)
// Calculate prefetch size for sequential access
// Only on a transition into SequentialAccess (the first operand is always
// true here because Pattern was just assigned).
// NOTE(review): if detectMLPatterns changed Pattern on an earlier call (e.g.
// to ModelAccess), oldPattern differs from SequentialAccess again, so this
// branch re-runs on the next qualifying read — confirm that is intended.
if info.Pattern == SequentialAccess && oldPattern != SequentialAccess {
apd.sequentialReads++
// Start with 4x the current read size, capped at 1MB
info.PrefetchSize = minInt64(4 * int64(size), 1024*1024)
glog.V(3).Infof("Sequential pattern detected for inode %d, prefetch size: %d",
info.Inode, info.PrefetchSize)
}
}
} else {
// Reset sequential counter on non-sequential access
if info.ConsecutiveSeq > 0 {
info.ConsecutiveSeq = 0
// NOTE(review): only SequentialAccess is demoted here; any other pattern
// (Model, Epoch, Strided) is left in place by this branch.
if info.Pattern == SequentialAccess {
info.Pattern = RandomAccess
info.Confidence = 0.5
info.PrefetchSize = 0
glog.V(4).Infof("Sequential pattern broken for inode %d", info.Inode)
return // Don't check for other patterns after breaking sequential
}
}
// Not reached for the read that breaks a sequential run (early return above).
apd.randomReads++
}
// ML-specific pattern detection
if apd.enableMLHeuristics {
apd.detectMLPatterns(info, offset, size, now)
}
// Adapt prefetch size based on access frequency
if info.Pattern == SequentialAccess && info.TotalAccesses > 10 {
// LastAccessTime still holds the previous read's time at this point.
timeSinceLastAccess := now.Sub(info.LastAccessTime)
if timeSinceLastAccess < 100*time.Millisecond {
// High frequency access, increase prefetch
info.PrefetchSize = minInt64(info.PrefetchSize * 2, 2*1024*1024) // Cap at 2MB
} else if timeSinceLastAccess > 5*time.Second {
// Low frequency access, decrease prefetch
info.PrefetchSize = maxInt64(info.PrefetchSize / 2, 64*1024) // Minimum 64KB
}
}
}
// detectMLPatterns applies the ML-specific heuristics to info and, when one
// matches, overrides Pattern, Confidence and PrefetchSize. The rules are tried
// in priority order and at most one fires:
//
//  1. Model loading: a read larger than 1MB on a file already classified as
//     SequentialAccess.
//  2. Epoch restart: a read at offset 0 on a file with more than 100 accesses
//     whose previous access was over a minute ago.
//  3. Strided access: more than 3 accesses, not SequentialAccess, and
//     isStridedAccess reports a stride.
//
// info.LastAccessTime must still describe the previous read when this runs.
func (apd *AccessPatternDetector) detectMLPatterns(info *AccessInfo, offset int64, size int, now time.Time) {
	const (
		kib = 1024
		mib = 1024 * kib
	)

	idle := now.Sub(info.LastAccessTime)

	switch {
	case size > mib && info.Pattern == SequentialAccess:
		// Large sequential reads often indicate model loading; prefetch aggressively.
		info.Pattern = ModelAccess
		info.Confidence = 0.9
		info.PrefetchSize = minInt64(8*mib, info.PrefetchSize*4)
		glog.V(3).Infof("Model access pattern detected for inode %d", info.Inode)

	case info.TotalAccesses > 100 && offset == 0 && idle > time.Minute:
		// Same file re-read from the start after a pause: prefetch at least
		// 256KB and at most 2MB at the beginning.
		info.Pattern = EpochAccess
		info.Confidence = 0.8
		info.PrefetchSize = minInt64(2*mib, maxInt64(info.PrefetchSize, 256*kib))
		glog.V(3).Infof("Epoch restart detected for inode %d", info.Inode)

	case info.TotalAccesses > 3 && info.Pattern != SequentialAccess && apd.isStridedAccess(info, offset):
		// Strided reads (common with image datasets): prefetch at least 128KB
		// and at most 1MB.
		info.Pattern = StridedAccess
		info.Confidence = 0.7
		info.PrefetchSize = minInt64(mib, maxInt64(info.PrefetchSize, 128*kib))
		glog.V(4).Infof("Strided access pattern detected for inode %d", info.Inode)
	}
}
// isStridedAccess reports whether the read at offset looks like a strided step
// from the previous read recorded in info.
//
// This is a simplified check: it needs at least 5 accesses on the file and
// accepts any forward gap strictly between 1KB and 64KB after the end of the
// previous read. It looks at a single gap only; it does not compare the gap
// with earlier ones, so it cannot tell whether the stride is actually
// repeating.
func (apd *AccessPatternDetector) isStridedAccess(info *AccessInfo, offset int64) bool {
	const (
		minAccesses = 5
		minGap      = 1024      // exclusive lower bound, 1KB
		maxGap      = 64 * 1024 // exclusive upper bound, 64KB
	)

	if info.TotalAccesses < minAccesses {
		return false
	}

	prevEnd := info.LastOffset + int64(info.LastSize)
	if offset <= prevEnd {
		// Overlapping, contiguous, or backwards read: not a stride.
		return false
	}

	skipped := offset - prevEnd
	return skipped > minGap && skipped < maxGap
}
// ShouldPrefetch reports whether prefetching should be triggered for inode
// and, if so, the recommended prefetch size.
//
// It returns (false, 0) when the inode is not tracked, when the pattern's
// confidence is below confidenceThreshold, or when the pattern is not one that
// is prefetched. Sequential, model and epoch patterns are prefetched once they
// clear the threshold. Strided access is held to a stricter bar and must
// additionally have confidence above 0.8. The size is non-zero only when the
// boolean is true.
//
// NOTE(review): detectMLPatterns in this file assigns confidence 0.7 to
// StridedAccess, so the strided bar is not met by the detector as written —
// confirm whether 0.8 or 0.7 is the intended value.
func (apd *AccessPatternDetector) ShouldPrefetch(inode uint64) (bool, int64) {
	apd.RLock()
	defer apd.RUnlock()

	info := apd.fileInfo[inode]
	if info == nil {
		return false, 0
	}

	// Only prefetch if we have high confidence in the pattern.
	if info.Confidence < apd.confidenceThreshold {
		return false, 0
	}

	switch info.Pattern {
	case SequentialAccess, ModelAccess, EpochAccess:
		return true, info.PrefetchSize
	case StridedAccess:
		// Be more conservative with strided access.
		if info.Confidence > 0.8 {
			return true, info.PrefetchSize
		}
	}
	// Every negative answer carries a zero size, matching the paths above.
	return false, 0
}
// GetPattern returns the pattern currently assigned to inode. A file the
// detector has no record of is reported as RandomAccess. The lookup is done
// under the read lock.
func (apd *AccessPatternDetector) GetPattern(inode uint64) AccessPattern {
	apd.RLock()
	defer apd.RUnlock()

	if tracked, found := apd.fileInfo[inode]; found && tracked != nil {
		return tracked.Pattern
	}
	return RandomAccess
}
// GetMetrics returns a point-in-time copy of the detector's counters together
// with the number of tracked files and how many of them currently sit in each
// pattern. The read lock is held while the snapshot is assembled; the returned
// PatternCounts map is freshly allocated and owned by the caller.
func (apd *AccessPatternDetector) GetMetrics() AccessPatternMetrics {
	apd.RLock()
	defer apd.RUnlock()

	snapshot := AccessPatternMetrics{
		TotalAccesses:     apd.totalAccesses,
		SequentialReads:   apd.sequentialReads,
		RandomReads:       apd.randomReads,
		PrefetchTriggered: apd.prefetchTriggered,
		TotalFiles:        int64(len(apd.fileInfo)),
		PatternCounts:     make(map[AccessPattern]int),
	}

	// Tally tracked files by their current classification.
	for _, tracked := range apd.fileInfo {
		snapshot.PatternCounts[tracked.Pattern]++
	}

	return snapshot
}
// AccessPatternMetrics is the snapshot returned by
// AccessPatternDetector.GetMetrics: the detector's running counters plus a
// per-pattern tally of the files tracked at the time of the call.
type AccessPatternMetrics struct {
// TotalAccesses is the number of RecordAccess calls across all files.
TotalAccesses int64
// SequentialReads is the detector's sequentialReads counter, which
// detectPattern bumps when a file transitions into SequentialAccess.
SequentialReads int64
// RandomReads is the detector's randomReads counter.
RandomReads int64
// PrefetchTriggered is the detector's prefetchTriggered counter.
// NOTE(review): nothing in this file increments it.
PrefetchTriggered int64
// TotalFiles is the number of inodes currently tracked.
TotalFiles int64
// PatternCounts maps each pattern to the number of tracked files currently
// classified as that pattern; patterns with no files have no entry.
PatternCounts map[AccessPattern]int
}
// addToHistory appends event to the global access history, keeping at most
// maxHistory entries by discarding the oldest ones first.
//
// The history is a plain slice kept in arrival order: when it is full the
// surviving entries are shifted down in place (reusing the backing array) and
// the new event is appended at the end. If maxHistory is zero or negative the
// history is disabled and the event is dropped rather than panicking on an
// empty slice. If the slice is already longer than maxHistory, every excess
// entry is trimmed, not just one.
//
// The caller must hold the detector's write lock.
func (apd *AccessPatternDetector) addToHistory(event AccessEvent) {
	if apd.maxHistory <= 0 {
		return
	}

	// Number of old entries that must go so that, after the append below, the
	// history holds at most maxHistory events.
	if excess := len(apd.recentAccesses) - apd.maxHistory + 1; excess > 0 {
		kept := copy(apd.recentAccesses, apd.recentAccesses[excess:])
		apd.recentAccesses = apd.recentAccesses[:kept]
	}

	apd.recentAccesses = append(apd.recentAccesses, event)
}
// CleanupOldEntries drops the tracking record of every file whose last
// recorded access is more than maxAge in the past. It holds the write lock for
// the whole sweep and logs the number of removed entries when there are any.
// The global access history is not touched.
func (apd *AccessPatternDetector) CleanupOldEntries(maxAge time.Duration) {
	apd.Lock()
	defer apd.Unlock()

	sweepTime := time.Now()
	removed := 0

	// Deleting the current key while ranging over a map is permitted in Go.
	for inode, tracked := range apd.fileInfo {
		if sweepTime.Sub(tracked.LastAccessTime) <= maxAge {
			continue
		}
		delete(apd.fileInfo, inode)
		removed++
	}

	if removed == 0 {
		return
	}
	glog.V(3).Infof("Cleaned up %d old access pattern entries", removed)
}
// Helper functions
// minInt64 returns the smaller of a and b.
func minInt64(a, b int64) int64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
// maxInt64 returns the larger of a and b.
func maxInt64(a, b int64) int64 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
// minFloat returns the smaller of a and b. Because the choice is made with a
// single a < b comparison, b is returned whenever that comparison is false,
// which includes the case where either argument is NaN.
func minFloat(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|