-rw-r--r--  weed/admin/task/admin_server.go  699
-rw-r--r--  weed/admin/task/admin_server_test.go  524
-rw-r--r--  weed/admin/task/compilation_stubs.go  90
-rw-r--r--  weed/admin/task/ec_integration_test.go  309
-rw-r--r--  weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go  324
-rw-r--r--  weed/admin/task/ec_test_standalone/go.mod  3
-rw-r--r--  weed/admin/task/ec_test_standalone/minimal_admin_server.go  324
-rw-r--r--  weed/admin/task/ec_test_standalone/minimal_integration_test.go  434
-rw-r--r--  weed/admin/task/ec_worker_test.go  488
-rw-r--r--  weed/admin/task/example_usage.go  346
-rw-r--r--  weed/admin/task/failure_handler.go  123
-rw-r--r--  weed/admin/task/master_sync.go  486
-rw-r--r--  weed/admin/task/minimal_admin_server.go  324
-rw-r--r--  weed/admin/task/minimal_integration_test.go  434
-rw-r--r--  weed/admin/task/operational_integration_test.go  197
-rw-r--r--  weed/admin/task/simple_integration_test.go  233
-rw-r--r--  weed/admin/task/simulation.go  604
-rw-r--r--  weed/admin/task/simulation/comprehensive_simulation.go  695
-rw-r--r--  weed/admin/task/simulation/comprehensive_simulation_test.go  444
-rw-r--r--  weed/admin/task/simulation/simulation_runner.go  294
-rw-r--r--  weed/admin/task/simulation/system_demo_test.go  237
-rw-r--r--  weed/admin/task/task_assignment_test.go  509
-rw-r--r--  weed/admin/task/task_detectors.go  168
-rw-r--r--  weed/admin/task/task_discovery.go  161
-rw-r--r--  weed/admin/task/task_scheduler.go  257
-rw-r--r--  weed/admin/task/task_types.go  68
-rw-r--r--  weed/admin/task/volume_state_manager.go  640
-rw-r--r--  weed/admin/task/volume_state_manager_test.go  440
-rw-r--r--  weed/admin/task/volume_state_tracker.go  226
-rw-r--r--  weed/admin/task/worker_communication.go  488
-rw-r--r--  weed/admin/task/worker_registry.go  348
-rw-r--r--  weed/admin/task_minimal/admin_server.go  324
-rw-r--r--  weed/admin/task_minimal/go.mod  3
-rw-r--r--  weed/admin/task_minimal/integration_test.go  233
34 files changed, 0 insertions, 11477 deletions
diff --git a/weed/admin/task/admin_server.go b/weed/admin/task/admin_server.go
deleted file mode 100644
index f5e2eaa62..000000000
--- a/weed/admin/task/admin_server.go
+++ /dev/null
@@ -1,699 +0,0 @@
-package task
-
-import (
- "fmt"
- "math/rand"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TaskHistory represents task execution history
-type TaskHistory struct {
- entries []TaskHistoryEntry
- mutex sync.RWMutex
-}
-
-// TaskHistoryEntry represents a single task history entry
-type TaskHistoryEntry struct {
- TaskID string
- TaskType types.TaskType
- VolumeID uint32
- WorkerID string
- Status types.TaskStatus
- StartedAt time.Time
- CompletedAt time.Time
- Duration time.Duration
- ErrorMessage string
-}
-
-// NewTaskHistory creates a new task history
-func NewTaskHistory() *TaskHistory {
- return &TaskHistory{
- entries: make([]TaskHistoryEntry, 0),
- }
-}
-
-// AddEntry adds a new task history entry
-func (th *TaskHistory) AddEntry(entry TaskHistoryEntry) {
- th.mutex.Lock()
- defer th.mutex.Unlock()
-
- th.entries = append(th.entries, entry)
-
- // Keep only the last 1000 entries
- if len(th.entries) > 1000 {
- th.entries = th.entries[len(th.entries)-1000:]
- }
-}
-
-// GetRecentEntries returns the most recent entries
-func (th *TaskHistory) GetRecentEntries(limit int) []*TaskHistoryEntry {
- th.mutex.RLock()
- defer th.mutex.RUnlock()
-
- start := len(th.entries) - limit
- if start < 0 {
- start = 0
- }
-
- result := make([]*TaskHistoryEntry, len(th.entries)-start)
- for i, entry := range th.entries[start:] {
- entryCopy := entry
- result[i] = &entryCopy
- }
-
- return result
-}
-
-// AdminServer manages task distribution and worker coordination
-type AdminServer struct {
- ID string
- Config *AdminConfig
- masterClient *wdclient.MasterClient
- volumeStateManager *VolumeStateManager
- workerRegistry *WorkerRegistry
- taskQueue *PriorityTaskQueue
- taskScheduler *TaskScheduler
- taskHistory *TaskHistory
- failureHandler *FailureHandler
- masterSync *MasterSynchronizer
- workerComm *WorkerCommunicationManager
- running bool
- stopCh chan struct{}
- mutex sync.RWMutex
-
- // Task tracking
- activeTasks map[string]*InProgressTask
- tasksMutex sync.RWMutex
-}
-
-// AdminConfig holds configuration for the admin server
-type AdminConfig struct {
- ScanInterval time.Duration
- WorkerTimeout time.Duration
- TaskTimeout time.Duration
- MaxRetries int
- ReconcileInterval time.Duration
- EnableFailureRecovery bool
- MaxConcurrentTasks int
-}
-
-// NewAdminServer creates a new admin server instance
-func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer {
- adminServer := &AdminServer{
- ID: generateAdminServerID(),
- Config: config,
- masterClient: masterClient,
- volumeStateManager: NewVolumeStateManager(masterClient),
- workerRegistry: NewWorkerRegistry(),
- taskQueue: NewPriorityTaskQueue(),
- taskHistory: NewTaskHistory(),
- failureHandler: NewFailureHandler(config),
- activeTasks: make(map[string]*InProgressTask),
- stopCh: make(chan struct{}),
- }
-
- // Initialize components that depend on admin server
- adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue)
- adminServer.masterSync = NewMasterSynchronizer(masterClient, adminServer.volumeStateManager, adminServer)
- adminServer.workerComm = NewWorkerCommunicationManager(adminServer)
-
- glog.Infof("Created admin server %s", adminServer.ID)
- return adminServer
-}
-
-// Start starts the admin server
-func (as *AdminServer) Start() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if as.running {
- return nil
- }
-
- glog.Infof("Starting admin server %s", as.ID)
-
- // Start components
- as.masterSync.Start()
- as.workerComm.Start()
-
- // Start background loops
- go as.taskAssignmentLoop()
- go as.taskMonitoringLoop()
- go as.reconciliationLoop()
- go as.metricsLoop()
-
- as.running = true
- glog.Infof("Admin server %s started successfully", as.ID)
-
- return nil
-}
-
-// Stop stops the admin server
-func (as *AdminServer) Stop() {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return
- }
-
- glog.Infof("Stopping admin server %s", as.ID)
-
- close(as.stopCh)
-
- // Stop components
- as.masterSync.Stop()
- as.workerComm.Stop()
-
- as.running = false
- glog.Infof("Admin server %s stopped", as.ID)
-}
-
-// RegisterWorker registers a new worker
-func (as *AdminServer) RegisterWorker(worker *types.Worker) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- return as.workerRegistry.RegisterWorker(worker)
-}
-
-// UnregisterWorker removes a worker
-func (as *AdminServer) UnregisterWorker(workerID string) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- // Reschedule any tasks assigned to this worker
- for taskID, task := range as.activeTasks {
- if task.WorkerID == workerID {
- glog.Warningf("Rescheduling task %s due to worker %s unregistration", taskID, workerID)
- as.ReassignTask(taskID, "worker unregistration")
- delete(as.activeTasks, taskID)
- }
- }
-
- return as.workerRegistry.UnregisterWorker(workerID)
-}
-
-// UpdateWorkerHeartbeat updates worker heartbeat
-func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- return as.workerRegistry.UpdateWorkerHeartbeat(workerID, status)
-}
-
-// RequestTask handles task requests from workers
-func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- if !as.running {
- return nil, fmt.Errorf("admin server is not running")
- }
-
- worker, exists := as.workerRegistry.GetWorker(workerID)
- if !exists {
- return nil, fmt.Errorf("worker %s not registered", workerID)
- }
-
- // Check if worker has capacity
- if worker.CurrentLoad >= worker.MaxConcurrent {
- return nil, nil // No capacity
- }
-
- // Get next task for this worker
- task := as.taskScheduler.GetNextTask(workerID, capabilities)
- if task == nil {
- return nil, nil // No suitable tasks
- }
-
- // Check if volume can be assigned (using comprehensive state management)
- if !as.canAssignTask(task, workerID) {
- return nil, nil // Cannot assign due to capacity or state constraints
- }
-
- // Assign task to worker
- inProgressTask := &InProgressTask{
- Task: task,
- WorkerID: workerID,
- StartedAt: time.Now(),
- LastUpdate: time.Now(),
- Progress: 0.0,
- EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)),
- }
-
- as.activeTasks[task.ID] = inProgressTask
- worker.CurrentLoad++
-
- // Register task impact with state manager
- impact := as.createTaskImpact(task)
- as.volumeStateManager.RegisterTaskImpact(task.ID, impact)
- inProgressTask.VolumeReserved = true
-
- glog.V(1).Infof("Assigned task %s to worker %s", task.ID, workerID)
- return task, nil
-}
-
-// UpdateTaskProgress updates task progress
-func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error {
- as.tasksMutex.Lock()
- defer as.tasksMutex.Unlock()
-
- inProgressTask, exists := as.activeTasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- inProgressTask.Progress = progress
- inProgressTask.LastUpdate = time.Now()
-
- glog.V(2).Infof("Task %s progress: %.1f%%", taskID, progress)
- return nil
-}
-
-// CompleteTask marks a task as completed
-func (as *AdminServer) CompleteTask(taskID string, success bool, errorMsg string) error {
- as.tasksMutex.Lock()
- defer as.tasksMutex.Unlock()
-
- inProgressTask, exists := as.activeTasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- // Remove from active tasks
- delete(as.activeTasks, taskID)
-
- // Update worker load
- if worker, exists := as.workerRegistry.GetWorker(inProgressTask.WorkerID); exists {
- worker.CurrentLoad--
- }
-
- // Unregister task impact
- as.volumeStateManager.UnregisterTaskImpact(taskID)
-
- // Record in task history
- status := types.TaskStatusCompleted
- if !success {
- status = types.TaskStatusFailed
- }
-
- as.taskHistory.AddEntry(TaskHistoryEntry{
- TaskID: taskID,
- TaskType: inProgressTask.Task.Type,
- VolumeID: inProgressTask.Task.VolumeID,
- WorkerID: inProgressTask.WorkerID,
- Status: status,
- StartedAt: inProgressTask.StartedAt,
- CompletedAt: time.Now(),
- Duration: time.Since(inProgressTask.StartedAt),
- ErrorMessage: errorMsg,
- })
-
- glog.Infof("Task %s completed: success=%v", taskID, success)
- return nil
-}
-
-// QueueTask adds a new task to the task queue
-func (as *AdminServer) QueueTask(task *types.Task) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- // Validate the task
- if task == nil {
- return fmt.Errorf("task cannot be nil")
- }
-
- if task.ID == "" {
- task.ID = generateTaskID()
- }
-
- // Set creation timestamp if not set
- if task.CreatedAt.IsZero() {
- task.CreatedAt = time.Now()
- }
-
- // Check if task for this volume is already queued or in progress
- if as.isVolumeAlreadyQueued(task.VolumeID, task.Type) {
- glog.V(2).Infof("Task for volume %d already queued or in progress, skipping", task.VolumeID)
- return nil
- }
-
- // Add to task queue
- as.taskQueue.Push(task)
-
- glog.V(1).Infof("Queued task %s (%s) for volume %d with priority %v",
- task.ID, task.Type, task.VolumeID, task.Priority)
-
- return nil
-}
-
-// Helper methods
-
-// canAssignTask checks if a task can be assigned to a worker
-func (as *AdminServer) canAssignTask(task *types.Task, workerID string) bool {
- worker, exists := as.workerRegistry.GetWorker(workerID)
- if !exists {
- return false
- }
-
- // Check worker capacity
- if worker.CurrentLoad >= worker.MaxConcurrent {
- return false
- }
-
- // Check if worker has required capability
- hasCapability := false
- for _, cap := range worker.Capabilities {
- if cap == task.Type {
- hasCapability = true
- break
- }
- }
- if !hasCapability {
- return false
- }
-
- return true
-}
-
-// createTaskImpact creates a TaskImpact for the given task
-func (as *AdminServer) createTaskImpact(task *types.Task) *TaskImpact {
- impact := &TaskImpact{
- TaskID: task.ID,
- VolumeID: task.VolumeID,
- TaskType: task.Type,
- StartedAt: time.Now(),
- EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)),
- CapacityDelta: make(map[string]int64),
- VolumeChanges: &VolumeChanges{},
- ShardChanges: make(map[int]*ShardChange),
- }
-
- // Set task-specific impacts
- switch task.Type {
- case types.TaskTypeErasureCoding:
- impact.VolumeChanges.WillBecomeReadOnly = true
- impact.EstimatedEnd = time.Now().Add(2 * time.Hour) // EC takes longer
-
- // EC encoding requires temporary space
- if server, ok := task.Parameters["server"]; ok {
- if serverStr, ok := server.(string); ok {
- volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID)
- if volumeState != nil && volumeState.CurrentState != nil {
- // Estimate 2x volume size needed temporarily
- impact.CapacityDelta[serverStr] = int64(volumeState.CurrentState.Size * 2)
- }
- }
- }
-
- case types.TaskTypeVacuum:
- // Vacuum reduces volume size
- if server, ok := task.Parameters["server"]; ok {
- if serverStr, ok := server.(string); ok {
- // Estimate 30% space reclamation
- volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID)
- if volumeState != nil && volumeState.CurrentState != nil {
- impact.CapacityDelta[serverStr] = -int64(float64(volumeState.CurrentState.Size) * 0.3)
- }
- }
- }
- }
-
- return impact
-}
-
-// estimateTaskDuration estimates how long a task will take
-func (as *AdminServer) estimateTaskDuration(task *types.Task) time.Duration {
- switch task.Type {
- case types.TaskTypeErasureCoding:
- return 2 * time.Hour
- case types.TaskTypeVacuum:
- return 30 * time.Minute
- default:
- return 1 * time.Hour
- }
-}
-
-// isVolumeAlreadyQueued checks if a task for the volume is already queued or in progress
-func (as *AdminServer) isVolumeAlreadyQueued(volumeID uint32, taskType types.TaskType) bool {
- // Check active tasks
- as.tasksMutex.RLock()
- for _, inProgressTask := range as.activeTasks {
- if inProgressTask.Task.VolumeID == volumeID && inProgressTask.Task.Type == taskType {
- as.tasksMutex.RUnlock()
- return true
- }
- }
- as.tasksMutex.RUnlock()
-
- // Check queued tasks
- return as.taskQueue.HasTask(volumeID, taskType)
-}
-
-// Background loops
-
-// taskAssignmentLoop handles automatic task assignment to workers
-func (as *AdminServer) taskAssignmentLoop() {
- ticker := time.NewTicker(5 * time.Second)
- defer ticker.Stop()
-
- for {
- select {
- case <-ticker.C:
- as.processTaskAssignments()
- case <-as.stopCh:
- return
- }
- }
-}
-
-// processTaskAssignments attempts to assign pending tasks to available workers
-func (as *AdminServer) processTaskAssignments() {
- // Get available workers
- workers := as.workerRegistry.GetAvailableWorkers()
- if len(workers) == 0 {
- return // No workers available
- }
-
- // For each worker with available capacity, try to assign a task
- for _, worker := range workers {
- if worker.CurrentLoad < worker.MaxConcurrent {
- task := as.taskScheduler.GetNextTask(worker.ID, worker.Capabilities)
- if task != nil {
- // Try to assign task directly
- _, err := as.RequestTask(worker.ID, worker.Capabilities)
- if err != nil {
- glog.Errorf("Failed to assign task to worker %s: %v", worker.ID, err)
- }
- }
- }
- }
-}
-
-// taskMonitoringLoop monitors task progress and handles timeouts
-func (as *AdminServer) taskMonitoringLoop() {
- ticker := time.NewTicker(1 * time.Minute)
- defer ticker.Stop()
-
- for {
- select {
- case <-ticker.C:
- as.checkTaskTimeouts()
- case <-as.stopCh:
- return
- }
- }
-}
-
-// checkTaskTimeouts checks for tasks that have timed out
-func (as *AdminServer) checkTaskTimeouts() {
- as.tasksMutex.Lock()
- defer as.tasksMutex.Unlock()
-
- now := time.Now()
- timeout := 2 * time.Hour // Default task timeout
-
- for taskID, inProgressTask := range as.activeTasks {
- if now.Sub(inProgressTask.LastUpdate) > timeout {
- glog.Warningf("Task %s timed out (last update: %v)", taskID, inProgressTask.LastUpdate)
- as.ReassignTask(taskID, "task timeout")
- }
- }
-}
-
-// ReassignTask reassigns a task due to worker failure
-func (as *AdminServer) ReassignTask(taskID, reason string) {
- as.tasksMutex.Lock()
- defer as.tasksMutex.Unlock()
-
- inProgressTask, exists := as.activeTasks[taskID]
- if !exists {
- return
- }
-
- glog.Infof("Reassigning task %s due to: %s", taskID, reason)
-
- // Reset task status
- inProgressTask.Task.Status = types.TaskStatusPending
-
- // Unregister current task impact
- as.volumeStateManager.UnregisterTaskImpact(taskID)
-
- // Remove from active tasks
- delete(as.activeTasks, taskID)
-
- // Put back in queue with higher priority
- inProgressTask.Task.Priority = types.TaskPriorityHigh
- as.taskQueue.Push(inProgressTask.Task)
-}
-
-// reconciliationLoop periodically reconciles state with master
-func (as *AdminServer) reconciliationLoop() {
- ticker := time.NewTicker(5 * time.Minute)
- defer ticker.Stop()
-
- for {
- select {
- case <-ticker.C:
- as.performReconciliation()
- case <-as.stopCh:
- return
- }
- }
-}
-
-// performReconciliation reconciles admin state with master
-func (as *AdminServer) performReconciliation() {
- glog.V(1).Infof("Starting state reconciliation")
-
- // Sync with master
- err := as.volumeStateManager.SyncWithMaster()
- if err != nil {
- glog.Errorf("Failed to sync with master during reconciliation: %v", err)
- return
- }
-
- glog.V(1).Infof("State reconciliation completed")
-}
-
-// metricsLoop periodically logs metrics and statistics
-func (as *AdminServer) metricsLoop() {
- ticker := time.NewTicker(1 * time.Minute)
- defer ticker.Stop()
-
- for {
- select {
- case <-ticker.C:
- as.logMetrics()
- case <-as.stopCh:
- return
- }
- }
-}
-
-// logMetrics logs current system metrics
-func (as *AdminServer) logMetrics() {
- as.tasksMutex.RLock()
- activeTasks := len(as.activeTasks)
- as.tasksMutex.RUnlock()
-
- queuedTasks := as.taskQueue.Size()
- activeWorkers := len(as.workerRegistry.GetAvailableWorkers())
-
- glog.V(1).Infof("Admin server metrics: active_tasks=%d, queued_tasks=%d, active_workers=%d",
- activeTasks, queuedTasks, activeWorkers)
-}
-
-// GetAvailableWorkers returns workers capable of handling the specified task type
-func (as *AdminServer) GetAvailableWorkers(taskType string) []*types.Worker {
- workers := as.workerRegistry.GetAvailableWorkers()
- var available []*types.Worker
-
- for _, worker := range workers {
- if worker.CurrentLoad < worker.MaxConcurrent {
- for _, cap := range worker.Capabilities {
- if string(cap) == taskType {
- available = append(available, worker)
- break
- }
- }
- }
- }
-
- return available
-}
-
-// GetSystemStats returns current system statistics
-func (as *AdminServer) GetSystemStats() *SystemStats {
- as.tasksMutex.RLock()
- activeTasks := len(as.activeTasks)
- as.tasksMutex.RUnlock()
-
- queuedTasks := as.taskQueue.Size()
- activeWorkers := len(as.workerRegistry.GetAvailableWorkers())
-
- return &SystemStats{
- ActiveTasks: activeTasks,
- QueuedTasks: queuedTasks,
- ActiveWorkers: activeWorkers,
- TotalWorkers: len(as.workerRegistry.GetAvailableWorkers()),
- Uptime: time.Since(time.Now()), // This should be tracked properly
- }
-}
-
-// Getter methods for testing
-func (as *AdminServer) GetQueuedTaskCount() int {
- return as.taskQueue.Size()
-}
-
-func (as *AdminServer) GetActiveTaskCount() int {
- as.tasksMutex.RLock()
- defer as.tasksMutex.RUnlock()
- return len(as.activeTasks)
-}
-
-func (as *AdminServer) GetTaskHistory() []*TaskHistoryEntry {
- return as.taskHistory.GetRecentEntries(100)
-}
-
-func (as *AdminServer) GetVolumeStateManager() *VolumeStateManager {
- return as.volumeStateManager
-}
-
-func (as *AdminServer) GetWorkerRegistry() *WorkerRegistry {
- return as.workerRegistry
-}
-
-// generateTaskID generates a unique task ID
-func generateTaskID() string {
- return fmt.Sprintf("task_%d_%d", time.Now().UnixNano(), rand.Intn(10000))
-}
-
-// generateAdminServerID generates a unique admin server ID
-func generateAdminServerID() string {
- return fmt.Sprintf("admin-%d", time.Now().Unix())
-}
-
-// SystemStats represents system statistics
-type SystemStats struct {
- ActiveTasks int
- QueuedTasks int
- ActiveWorkers int
- TotalWorkers int
- Uptime time.Duration
- LastMasterSync time.Time
-}
diff --git a/weed/admin/task/admin_server_test.go b/weed/admin/task/admin_server_test.go
deleted file mode 100644
index 3862cf48d..000000000
--- a/weed/admin/task/admin_server_test.go
+++ /dev/null
@@ -1,524 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-func TestAdminServer_TaskAssignmentWithStateManagement(t *testing.T) {
- // Test the core functionality: accurate task assignment based on comprehensive state
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
-
- // Initialize components
- adminServer.workerRegistry = NewWorkerRegistry()
- adminServer.taskQueue = NewPriorityTaskQueue()
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue)
- adminServer.running = true // Mark as running for test
-
- // Setup test worker
- worker := &types.Worker{
- ID: "test_worker_1",
- Address: "server1:8080",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- }
- adminServer.workerRegistry.RegisterWorker(worker)
-
- // Setup volume state
- volumeID := uint32(1)
- adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 28 * 1024 * 1024 * 1024, // 28GB - good for EC
- Server: "server1",
- },
- InProgressTasks: []*TaskImpact{},
- PlannedChanges: []*PlannedOperation{},
- }
-
- // Setup server capacity
- adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{
- Server: "server1",
- TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
- UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used
- PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used
- }
-
- // Create EC task
- task := &types.Task{
- ID: "ec_task_1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: volumeID,
- Server: "server1",
- Priority: types.TaskPriorityNormal,
- }
-
- // Test task assignment
- adminServer.taskQueue.Push(task)
-
- assignedTask, err := adminServer.RequestTask("test_worker_1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Task assignment failed: %v", err)
- }
-
- if assignedTask == nil {
- t.Fatal("Expected task to be assigned, got nil")
- }
-
- if assignedTask.ID != "ec_task_1" {
- t.Errorf("Expected task ec_task_1, got %s", assignedTask.ID)
- }
-
- // Verify state manager was updated
- if len(adminServer.volumeStateManager.inProgressTasks) != 1 {
- t.Errorf("Expected 1 in-progress task in state manager, got %d", len(adminServer.volumeStateManager.inProgressTasks))
- }
-
- // Verify capacity reservation
- capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1")
- if capacity.ReservedCapacity <= 0 {
- t.Error("Expected capacity to be reserved for EC task")
- }
-
- t.Log("✅ Task assignment with state management test passed")
-}
-
-func TestAdminServer_CanAssignTask(t *testing.T) {
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.inProgressTasks = make(map[string]*InProgressTask)
-
- // Setup volume state
- volumeID := uint32(1)
- adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 25 * 1024 * 1024 * 1024, // 25GB
- },
- }
-
- // Setup server capacity - limited space
- serverID := "server1"
- adminServer.volumeStateManager.capacityCache[serverID] = &CapacityInfo{
- Server: serverID,
- TotalCapacity: 30 * 1024 * 1024 * 1024, // 30GB total
- UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used
- PredictedUsage: 20 * 1024 * 1024 * 1024, // 10GB available
- }
-
- worker := &types.Worker{
- ID: "worker1",
- Address: serverID,
- }
-
- tests := []struct {
- name string
- taskType types.TaskType
- expected bool
- desc string
- }{
- {
- name: "EC task fits",
- taskType: types.TaskTypeErasureCoding,
- expected: false, // 25GB * 1.4 = 35GB needed, but only 10GB available
- desc: "EC task should not fit due to insufficient capacity",
- },
- {
- name: "Vacuum task fits",
- taskType: types.TaskTypeVacuum,
- expected: true,
- desc: "Vacuum task should fit (no capacity increase)",
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- task := &types.Task{
- ID: "test_task",
- Type: tt.taskType,
- VolumeID: volumeID,
- Server: serverID,
- }
-
- result := adminServer.canAssignTask(task, worker)
- if result != tt.expected {
- t.Errorf("canAssignTask() = %v, want %v. %s", result, tt.expected, tt.desc)
- }
- })
- }
-}
-
-func TestAdminServer_CreateTaskImpact(t *testing.T) {
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
-
- // Setup volume state for EC task
- volumeID := uint32(1)
- adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 25 * 1024 * 1024 * 1024, // 25GB
- },
- }
-
- task := &types.Task{
- ID: "ec_task_1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: volumeID,
- Server: "server1",
- }
-
- impact := adminServer.createTaskImpact(task, "worker1")
-
- // Verify impact structure
- if impact.TaskID != "ec_task_1" {
- t.Errorf("Expected task ID ec_task_1, got %s", impact.TaskID)
- }
-
- if impact.TaskType != types.TaskTypeErasureCoding {
- t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, impact.TaskType)
- }
-
- // Verify volume changes for EC task
- if !impact.VolumeChanges.WillBecomeReadOnly {
- t.Error("Expected volume to become read-only after EC")
- }
-
- // Verify capacity delta (EC should require ~40% more space)
- expectedCapacity := int64(float64(25*1024*1024*1024) * 1.4) // ~35GB
- actualCapacity := impact.CapacityDelta["server1"]
- if actualCapacity != expectedCapacity {
- t.Errorf("Expected capacity delta %d, got %d", expectedCapacity, actualCapacity)
- }
-
- // Verify shard changes (should plan 14 shards)
- if len(impact.ShardChanges) != 14 {
- t.Errorf("Expected 14 shard changes, got %d", len(impact.ShardChanges))
- }
-
- for i := 0; i < 14; i++ {
- shardChange := impact.ShardChanges[i]
- if shardChange == nil {
- t.Errorf("Missing shard change for shard %d", i)
- continue
- }
-
- if !shardChange.WillBeCreated {
- t.Errorf("Shard %d should be marked for creation", i)
- }
- }
-
- t.Log("✅ Task impact creation test passed")
-}
-
-func TestAdminServer_TaskCompletionStateCleanup(t *testing.T) {
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.workerRegistry = NewWorkerRegistry()
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.inProgressTasks = make(map[string]*InProgressTask)
-
- // Setup worker
- worker := &types.Worker{
- ID: "worker1",
- CurrentLoad: 1, // Has 1 task assigned
- }
- adminServer.workerRegistry.RegisterWorker(worker)
-
- // Setup in-progress task
- task := &types.Task{
- ID: "test_task_1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1,
- }
-
- inProgressTask := &InProgressTask{
- Task: task,
- WorkerID: "worker1",
- VolumeReserved: true,
- }
- adminServer.inProgressTasks["test_task_1"] = inProgressTask
-
- // Register impact in state manager
- impact := &TaskImpact{
- TaskID: "test_task_1",
- VolumeID: 1,
- CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings
- }
- adminServer.volumeStateManager.RegisterTaskImpact("test_task_1", impact)
-
- // Complete the task
- err := adminServer.CompleteTask("test_task_1", true, "")
- if err != nil {
- t.Errorf("Task completion failed: %v", err)
- }
-
- // Verify cleanup
- if len(adminServer.inProgressTasks) != 0 {
- t.Errorf("Expected 0 in-progress tasks after completion, got %d", len(adminServer.inProgressTasks))
- }
-
- // Verify worker load updated
- updatedWorker, _ := adminServer.workerRegistry.GetWorker("worker1")
- if updatedWorker.CurrentLoad != 0 {
- t.Errorf("Expected worker load 0 after task completion, got %d", updatedWorker.CurrentLoad)
- }
-
- // Verify state manager cleaned up
- if len(adminServer.volumeStateManager.inProgressTasks) != 0 {
- t.Errorf("Expected 0 tasks in state manager after completion, got %d", len(adminServer.volumeStateManager.inProgressTasks))
- }
-
- t.Log("✅ Task completion state cleanup test passed")
-}
-
-func TestAdminServer_PreventDuplicateTaskAssignment(t *testing.T) {
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.workerRegistry = NewWorkerRegistry()
- adminServer.taskQueue = NewPriorityTaskQueue()
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.inProgressTasks = make(map[string]*InProgressTask)
-
- // Setup worker
- worker := &types.Worker{
- ID: "worker1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- }
- adminServer.workerRegistry.RegisterWorker(worker)
-
- // Setup volume state
- volumeID := uint32(1)
- adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024},
- }
-
- // Create first task and assign it
- task1 := &types.Task{
- ID: "vacuum_task_1",
- Type: types.TaskTypeVacuum,
- VolumeID: volumeID,
- Priority: types.TaskPriorityNormal,
- }
-
- adminServer.taskQueue.Push(task1)
- assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil || assignedTask1 == nil {
- t.Fatal("First task assignment failed")
- }
-
- // Try to assign another vacuum task for the same volume
- task2 := &types.Task{
- ID: "vacuum_task_2",
- Type: types.TaskTypeVacuum,
- VolumeID: volumeID, // Same volume!
- Priority: types.TaskPriorityNormal,
- }
-
- adminServer.taskQueue.Push(task2)
- assignedTask2, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum})
-
- // Should not assign duplicate task
- if assignedTask2 != nil {
- t.Error("Should not assign duplicate vacuum task for same volume")
- }
-
- t.Log("✅ Duplicate task prevention test passed")
-}
-
-func TestAdminServer_SystemStats(t *testing.T) {
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.workerRegistry = NewWorkerRegistry()
- adminServer.taskQueue = NewPriorityTaskQueue()
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.inProgressTasks = make(map[string]*InProgressTask)
- adminServer.running = true
-
- // Add some test data
- worker := &types.Worker{ID: "worker1", Status: "active"}
- adminServer.workerRegistry.RegisterWorker(worker)
-
- task := &types.Task{ID: "task1", Type: types.TaskTypeErasureCoding}
- adminServer.taskQueue.Push(task)
-
- inProgressTask := &InProgressTask{
- Task: &types.Task{ID: "task2", Type: types.TaskTypeVacuum},
- }
- adminServer.inProgressTasks["task2"] = inProgressTask
-
- // Get system stats
- stats := adminServer.GetSystemStats()
-
- // Verify stats structure
- if !stats["running"].(bool) {
- t.Error("Expected running to be true")
- }
-
- if stats["in_progress_tasks"].(int) != 1 {
- t.Errorf("Expected 1 in-progress task, got %d", stats["in_progress_tasks"].(int))
- }
-
- if stats["queued_tasks"].(int) != 1 {
- t.Errorf("Expected 1 queued task, got %d", stats["queued_tasks"].(int))
- }
-
- // Check task breakdown
- tasksByType := stats["tasks_by_type"].(map[types.TaskType]int)
- if tasksByType[types.TaskTypeVacuum] != 1 {
- t.Errorf("Expected 1 vacuum task, got %d", tasksByType[types.TaskTypeVacuum])
- }
-
- t.Log("✅ System stats test passed")
-}
-
-func TestAdminServer_VolumeStateIntegration(t *testing.T) {
- // Integration test: Verify admin server correctly uses volume state for decisions
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.workerRegistry = NewWorkerRegistry()
- adminServer.taskQueue = NewPriorityTaskQueue()
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.inProgressTasks = make(map[string]*InProgressTask)
-
- // Setup worker
- worker := &types.Worker{
- ID: "worker1",
- Address: "server1",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- }
- adminServer.workerRegistry.RegisterWorker(worker)
-
- // Setup volume and capacity that would normally allow EC
- volumeID := uint32(1)
- adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 25 * 1024 * 1024 * 1024, // 25GB
- Server: "server1",
- },
- }
-
- adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{
- Server: "server1",
- TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
- UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used
- PredictedUsage: 20 * 1024 * 1024 * 1024, // 80GB available
- }
-
- // Create EC task
- task := &types.Task{
- ID: "ec_task_1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: volumeID,
- Server: "server1",
- }
-
- adminServer.taskQueue.Push(task)
-
- // First assignment should work
- assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil || assignedTask1 == nil {
- t.Fatal("First EC task assignment should succeed")
- }
-
- // Verify capacity is now reserved
- capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1")
- if capacity.ReservedCapacity <= 0 {
- t.Error("Expected capacity to be reserved for first EC task")
- }
-
- // Try to assign another large EC task - should fail due to capacity
- volumeID2 := uint32(2)
- adminServer.volumeStateManager.volumes[volumeID2] = &VolumeState{
- VolumeID: volumeID2,
- CurrentState: &VolumeInfo{
- ID: volumeID2,
- Size: 30 * 1024 * 1024 * 1024, // 30GB - would need 42GB for EC
- Server: "server1",
- },
- }
-
- task2 := &types.Task{
- ID: "ec_task_2",
- Type: types.TaskTypeErasureCoding,
- VolumeID: volumeID2,
- Server: "server1",
- }
-
- adminServer.taskQueue.Push(task2)
-
- // Add another worker to test capacity-based rejection
- worker2 := &types.Worker{
- ID: "worker2",
- Address: "server1",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- }
- adminServer.workerRegistry.RegisterWorker(worker2)
-
- assignedTask2, err := adminServer.RequestTask("worker2", []types.TaskType{types.TaskTypeErasureCoding})
-
- // Should not assign due to insufficient capacity
- if assignedTask2 != nil {
- t.Error("Should not assign second EC task due to insufficient server capacity")
- }
-
- t.Log("✅ Volume state integration test passed")
- t.Log("✅ Admin server correctly uses comprehensive state for task assignment decisions")
-}
-
-// Benchmark for task assignment performance
-func BenchmarkAdminServer_RequestTask(b *testing.B) {
- adminServer := NewAdminServer(DefaultAdminConfig(), nil)
- adminServer.workerRegistry = NewWorkerRegistry()
- adminServer.taskQueue = NewPriorityTaskQueue()
- adminServer.volumeStateManager = NewVolumeStateManager(nil)
- adminServer.inProgressTasks = make(map[string]*InProgressTask)
-
- // Setup worker
- worker := &types.Worker{
- ID: "bench_worker",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 1000, // High limit for benchmark
- Status: "active",
- CurrentLoad: 0,
- }
- adminServer.workerRegistry.RegisterWorker(worker)
-
- // Setup many tasks
- for i := 0; i < 1000; i++ {
- volumeID := uint32(i + 1)
- adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024},
- }
-
- task := &types.Task{
- ID: fmt.Sprintf("task_%d", i),
- Type: types.TaskTypeVacuum,
- VolumeID: volumeID,
- }
- adminServer.taskQueue.Push(task)
- }
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- adminServer.RequestTask("bench_worker", []types.TaskType{types.TaskTypeVacuum})
- }
-}
diff --git a/weed/admin/task/compilation_stubs.go b/weed/admin/task/compilation_stubs.go
deleted file mode 100644
index 2c90361dd..000000000
--- a/weed/admin/task/compilation_stubs.go
+++ /dev/null
@@ -1,90 +0,0 @@
-package task
-
-import (
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// Compilation stubs for missing types and functions
-
-// Task is an alias for types.Task for backward compatibility
-type Task = types.Task
-
-// TaskType is an alias for types.TaskType for backward compatibility
-type TaskType = types.TaskType
-
-// TaskStatus is an alias for types.TaskStatus for backward compatibility
-type TaskStatus = types.TaskStatus
-
-// TaskPriority is an alias for types.TaskPriority for backward compatibility
-type TaskPriority = types.TaskPriority
-
-// Additional type aliases for compilation
-var (
- TaskStatusCompleted = types.TaskStatusCompleted
- TaskStatusFailed = types.TaskStatusFailed
-)
-
-// Worker represents a worker node
-type Worker struct {
- ID string
- Address string
- Capabilities []string
- Status string
- LastSeen time.Time
-}
-
-// convertAdminToWorkerMessage converts AdminMessage to WorkerMessage for stream compatibility
-func convertAdminToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage {
- // This is a workaround for the stream type mismatch
- // In a real implementation, this would need proper message conversion
- return &worker_pb.WorkerMessage{
- WorkerId: msg.AdminId,
- Timestamp: msg.Timestamp,
- // Add basic message conversion logic here
- }
-}
-
-// WorkerRegistry stub methods
-func (wr *WorkerRegistry) UpdateWorkerStatus(workerID string, status interface{}) {
- // Stub implementation
-}
-
-// AdminServer stub methods
-func (as *AdminServer) AssignTaskToWorker(workerID string) *Task {
- // Stub implementation
- return nil
-}
-
-// DefaultAdminConfig returns default admin server configuration
-func DefaultAdminConfig() *AdminConfig {
- return &AdminConfig{
- ScanInterval: 30 * time.Minute,
- WorkerTimeout: 5 * time.Minute,
- TaskTimeout: 10 * time.Minute,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 10,
- }
-}
-
-// SyncWithMasterData is a stub for the volume state manager
-func (vsm *VolumeStateManager) SyncWithMasterData(volumes map[uint32]*VolumeInfo, ecShards map[uint32]map[int]*ShardInfo, serverCapacity map[string]*CapacityInfo) error {
- // Stub implementation - would normally sync the data
- return nil
-}
-
-// GetAllVolumeStates is a stub for the volume state manager
-func (vsm *VolumeStateManager) GetAllVolumeStates() map[uint32]*VolumeState {
- // Stub implementation - return empty map
- return make(map[uint32]*VolumeState)
-}
-
-// DetectInconsistencies is a stub for the volume state manager
-func (vsm *VolumeStateManager) DetectInconsistencies() []StateInconsistency {
- // Stub implementation - return empty slice
- return []StateInconsistency{}
-}
diff --git a/weed/admin/task/ec_integration_test.go b/weed/admin/task/ec_integration_test.go
deleted file mode 100644
index d614495c0..000000000
--- a/weed/admin/task/ec_integration_test.go
+++ /dev/null
@@ -1,309 +0,0 @@
-package task
-
-import (
- "os"
- "path/filepath"
- "testing"
- "time"
-
- ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestECIntegration tests the EC implementation with the admin server
-func TestECIntegration(t *testing.T) {
- t.Logf("Starting EC integration test")
-
- // Step 1: Create admin server
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 30 * time.Minute, // EC takes longer
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 2, // Limit concurrency for EC tasks
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 2: Register an EC-capable worker
- worker := &types.Worker{
- ID: "ec-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register EC worker: %v", err)
- }
- t.Logf("Successfully registered EC worker %s", worker.ID)
-
- // Step 3: Create an EC task
- ecTask := &types.Task{
- ID: "ec-task-1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 12345,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityHigh,
- Parameters: map[string]interface{}{
- "volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB
- "master_client": "localhost:9333",
- "work_dir": "/tmp/seaweedfs_ec_work",
- "collection": "test",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(ecTask)
- if err != nil {
- t.Fatalf("Failed to queue EC task: %v", err)
- }
- t.Logf("Successfully queued EC task %s for volume %d", ecTask.ID, ecTask.VolumeID)
-
- // Step 4: Worker requests the task
- assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Fatalf("Failed to request EC task: %v", err)
- }
-
- if assignedTask != nil {
- t.Logf("EC worker got task: %s (%s) for volume %d",
- assignedTask.ID, assignedTask.Type, assignedTask.VolumeID)
-
- // Step 5: Simulate EC task execution phases
- t.Logf("Simulating EC task execution phases")
-
- // Phase 1: Copying volume data
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0)
- if err != nil {
- t.Errorf("Failed to update progress (copying): %v", err)
- }
- t.Logf("Phase 1: Volume data copied to local disk")
-
- // Phase 2: Marking read-only
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
- if err != nil {
- t.Errorf("Failed to update progress (read-only): %v", err)
- }
- t.Logf("Phase 2: Source volume marked as read-only")
-
- // Phase 3: Local EC encoding
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0)
- if err != nil {
- t.Errorf("Failed to update progress (encoding): %v", err)
- }
- t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)")
-
- // Phase 4: Calculating optimal placement
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0)
- if err != nil {
- t.Errorf("Failed to update progress (placement): %v", err)
- }
- t.Logf("Phase 4: Optimal shard placement calculated with affinity")
-
- // Phase 5: Distributing shards
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0)
- if err != nil {
- t.Errorf("Failed to update progress (distribution): %v", err)
- }
- t.Logf("Phase 5: Shards distributed across servers with rack diversity")
-
- // Phase 6: Verification and cleanup
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress (completion): %v", err)
- }
- t.Logf("Phase 6: Verification and cleanup completed")
-
- // Step 6: Complete the task
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete EC task: %v", err)
- }
- t.Logf("Successfully completed EC task %s", assignedTask.ID)
- } else {
- t.Logf("No EC task was assigned (expected in test environment)")
- }
-
- // Step 7: Verify task completion
- stats := adminServer.GetSystemStats()
- t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
-
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d completed tasks", len(history))
-
- if len(history) > 0 {
- lastEntry := history[len(history)-1]
- t.Logf("Last completed task: %s (%s) - Duration: %v",
- lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration)
-
- if lastEntry.TaskType == types.TaskTypeErasureCoding {
- t.Logf("EC task completed successfully")
- }
- }
-
- t.Logf("EC integration test completed successfully")
-}
-
-// TestECTaskValidation tests the EC task validation
-func TestECTaskValidation(t *testing.T) {
- t.Logf("Testing EC task validation")
-
- // Create a temporary work directory
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test")
- err := os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
-
- // Create EC task
- ecTask := ec_task.NewTaskWithParams(
- "localhost:8080", // source server
- 12345, // volume ID
- "localhost:9333", // master client
- workDir, // work directory
- )
-
- // Test validation with valid parameters
- validParams := types.TaskParams{
- VolumeID: 12345,
- Server: "localhost:8080",
- Collection: "test",
- Parameters: map[string]interface{}{
- "volume_size": int64(32 * 1024 * 1024 * 1024),
- },
- }
-
- err = ecTask.Validate(validParams)
- if err != nil {
- t.Errorf("Valid parameters should pass validation: %v", err)
- }
-
- // Test validation with invalid parameters
- invalidParams := types.TaskParams{
- VolumeID: 0, // Invalid volume ID
- Server: "", // Empty server
- }
-
- err = ecTask.Validate(invalidParams)
- if err == nil {
- t.Errorf("Invalid parameters should fail validation")
- }
-
- // Test time estimation
- estimatedTime := ecTask.EstimateTime(validParams)
- t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime)
-
- if estimatedTime < 20*time.Minute {
- t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime)
- }
-
- t.Logf("EC task validation completed successfully")
-}
-
-// TestECFeatures tests specific EC features
-func TestECFeatures(t *testing.T) {
- t.Logf("Testing EC features")
-
- // Create temporary work directory
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test")
- err := os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
-
- ecTask := ec_task.NewTaskWithParams(
- "localhost:8080",
- 54321,
- "localhost:9333",
- workDir,
- )
-
- // Test step tracking
- t.Logf("Testing step tracking functionality")
-
- currentStep := ecTask.GetCurrentStep()
- t.Logf("Initial current step: %s", currentStep)
-
- progress := ecTask.GetProgress()
- t.Logf("Initial progress: %.1f%%", progress)
-
- // Test parameter extraction
- params := types.TaskParams{
- VolumeID: 54321,
- Server: "localhost:8080",
- Collection: "features_test",
- Parameters: map[string]interface{}{
- "volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB
- "data_shards": 10,
- "parity_shards": 4,
- "affinity_zones": []string{"zone-a", "zone-b", "zone-c"},
- },
- }
-
- estimatedTime := ecTask.EstimateTime(params)
- expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB
-
- t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime)
-
- if estimatedTime < expectedMinTime {
- t.Errorf("Time estimate seems too low for 64GB volume")
- }
-
- t.Logf("EC features test completed successfully")
-}
-
-// TestECTaskComparison tests EC implementation features
-func TestECTaskComparison(t *testing.T) {
- t.Logf("Testing EC implementation features")
-
- // EC task estimation
- params := types.TaskParams{
- VolumeID: 11111,
- Server: "localhost:8080",
- Parameters: map[string]interface{}{
- "volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB
- },
- }
-
- // Create task
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison")
- defer os.RemoveAll(workDir)
-
- ecTask := ec_task.NewTaskWithParams(
- "localhost:8080",
- 22222,
- "localhost:9333",
- workDir,
- )
- estimatedTime := ecTask.EstimateTime(params)
-
- t.Logf("EC task estimated time: %v", estimatedTime)
-
- // Test feature capabilities
- t.Logf("EC implementation features:")
- t.Logf(" - Local volume data copying with progress tracking")
- t.Logf(" - Local Reed-Solomon encoding (10+4 shards)")
- t.Logf(" - Intelligent shard placement with rack awareness")
- t.Logf(" - Load balancing across available servers")
- t.Logf(" - Backup server selection for redundancy")
- t.Logf(" - Detailed step-by-step progress tracking")
- t.Logf(" - Comprehensive error handling and recovery")
-
- t.Logf("EC implementation test completed successfully")
-}
diff --git a/weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go b/weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go
deleted file mode 100644
index 37132d858..000000000
--- a/weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go
+++ /dev/null
@@ -1,324 +0,0 @@
-package task
-
-import (
- "os"
- "path/filepath"
- "testing"
- "time"
-
- ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestEnhancedECIntegration tests the enhanced EC implementation with the admin server
-func TestEnhancedECIntegration(t *testing.T) {
- t.Logf("Starting enhanced EC integration test")
-
- // Step 1: Create admin server
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 30 * time.Minute, // EC takes longer
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 2, // Limit concurrency for EC tasks
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 2: Register an EC-capable worker
- worker := &types.Worker{
- ID: "ec-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register EC worker: %v", err)
- }
- t.Logf("Successfully registered EC worker %s", worker.ID)
-
- // Step 3: Create an EC task
- ecTask := &types.Task{
- ID: "enhanced-ec-task-1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 12345,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityHigh,
- Parameters: map[string]interface{}{
- "volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB
- "master_client": "localhost:9333",
- "work_dir": "/tmp/seaweedfs_ec_work",
- "collection": "test",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(ecTask)
- if err != nil {
- t.Fatalf("Failed to queue EC task: %v", err)
- }
- t.Logf("Successfully queued enhanced EC task %s for volume %d", ecTask.ID, ecTask.VolumeID)
-
- // Step 4: Worker requests the task
- assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Fatalf("Failed to request EC task: %v", err)
- }
-
- if assignedTask != nil {
- t.Logf("EC worker got task: %s (%s) for volume %d",
- assignedTask.ID, assignedTask.Type, assignedTask.VolumeID)
-
- // Step 5: Simulate enhanced EC task execution progress
- t.Logf("Simulating enhanced EC task execution phases")
-
- // Phase 1: Copying volume data
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0)
- if err != nil {
- t.Errorf("Failed to update progress (copying): %v", err)
- }
- t.Logf("Phase 1: Volume data copied to local disk")
-
- // Phase 2: Marking read-only
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
- if err != nil {
- t.Errorf("Failed to update progress (read-only): %v", err)
- }
- t.Logf("Phase 2: Source volume marked as read-only")
-
- // Phase 3: Local EC encoding
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0)
- if err != nil {
- t.Errorf("Failed to update progress (encoding): %v", err)
- }
- t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)")
-
- // Phase 4: Calculating optimal placement
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0)
- if err != nil {
- t.Errorf("Failed to update progress (placement): %v", err)
- }
- t.Logf("Phase 4: Optimal shard placement calculated with affinity")
-
- // Phase 5: Distributing shards
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0)
- if err != nil {
- t.Errorf("Failed to update progress (distribution): %v", err)
- }
- t.Logf("Phase 5: Shards distributed across servers with rack diversity")
-
- // Phase 6: Verification and cleanup
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress (completion): %v", err)
- }
- t.Logf("Phase 6: Verification and cleanup completed")
-
- // Step 6: Complete the task
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete EC task: %v", err)
- }
- t.Logf("Successfully completed enhanced EC task %s", assignedTask.ID)
- } else {
- t.Logf("No EC task was assigned (expected in test environment)")
- }
-
- // Step 7: Verify task completion
- stats := adminServer.GetSystemStats()
- t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
-
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d completed tasks", len(history))
-
- if len(history) > 0 {
- lastEntry := history[len(history)-1]
- t.Logf("Last completed task: %s (%s) - Duration: %v",
- lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration)
-
- if lastEntry.TaskType == types.TaskTypeErasureCoding {
- t.Logf("Enhanced EC task completed successfully")
- }
- }
-
- t.Logf("Enhanced EC integration test completed successfully")
-}
-
-// TestEnhancedECTaskValidation tests the enhanced EC task validation
-func TestEnhancedECTaskValidation(t *testing.T) {
- t.Logf("Testing enhanced EC task validation")
-
- // Create a temporary work directory
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test")
- err := os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
-
- // Create enhanced EC task
- enhancedTask := ec_task.NewEnhancedECTask(
- "localhost:8080", // source server
- 12345, // volume ID
- "localhost:9333", // master client
- workDir, // work directory
- )
-
- // Test validation with valid parameters
- validParams := types.TaskParams{
- VolumeID: 12345,
- Server: "localhost:8080",
- Collection: "test",
- Parameters: map[string]interface{}{
- "volume_size": int64(32 * 1024 * 1024 * 1024),
- },
- }
-
- err = enhancedTask.Validate(validParams)
- if err != nil {
- t.Errorf("Valid parameters should pass validation: %v", err)
- }
-
- // Test validation with invalid parameters
- invalidParams := types.TaskParams{
- VolumeID: 0, // Invalid volume ID
- Server: "", // Empty server
- }
-
- err = enhancedTask.Validate(invalidParams)
- if err == nil {
- t.Errorf("Invalid parameters should fail validation")
- }
-
- // Test time estimation
- estimatedTime := enhancedTask.EstimateTime(validParams)
- t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime)
-
- if estimatedTime < 20*time.Minute {
- t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime)
- }
-
- t.Logf("Enhanced EC task validation completed successfully")
-}
-
-// TestEnhancedECFeatures tests specific enhanced EC features
-func TestEnhancedECFeatures(t *testing.T) {
- t.Logf("Testing enhanced EC features")
-
- // Create temporary work directory
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test")
- err := os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
-
- enhancedTask := ec_task.NewEnhancedECTask(
- "localhost:8080",
- 54321,
- "localhost:9333",
- workDir,
- )
-
- // Test step tracking
- t.Logf("Testing step tracking functionality")
-
- currentStep := enhancedTask.GetCurrentStep()
- t.Logf("Initial current step: %s", currentStep)
-
- progress := enhancedTask.GetProgress()
- t.Logf("Initial progress: %.1f%%", progress)
-
- // Test parameter extraction
- params := types.TaskParams{
- VolumeID: 54321,
- Server: "localhost:8080",
- Collection: "enhanced_test",
- Parameters: map[string]interface{}{
- "volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB
- "data_shards": 10,
- "parity_shards": 4,
- "affinity_zones": []string{"zone-a", "zone-b", "zone-c"},
- },
- }
-
- estimatedTime := enhancedTask.EstimateTime(params)
- expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB
-
- t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime)
-
- if estimatedTime < expectedMinTime {
- t.Errorf("Time estimate seems too low for 64GB volume")
- }
-
- t.Logf("Enhanced EC features test completed successfully")
-}
-
-// TestECTaskComparison compares basic vs enhanced EC implementations
-func TestECTaskComparison(t *testing.T) {
- t.Logf("Comparing basic vs enhanced EC implementations")
-
- // Basic EC task estimation
- basicParams := types.TaskParams{
- VolumeID: 11111,
- Server: "localhost:8080",
- Parameters: map[string]interface{}{
- "volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB
- },
- }
-
- // Create basic task (existing implementation)
- basicTask := ec_task.NewTask("localhost:8080", 11111)
- basicTime := basicTask.EstimateTime(basicParams)
-
- // Create enhanced task
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison")
- defer os.RemoveAll(workDir)
-
- enhancedTask := ec_task.NewEnhancedECTask(
- "localhost:8080",
- 22222,
- "localhost:9333",
- workDir,
- )
- enhancedTime := enhancedTask.EstimateTime(basicParams)
-
- t.Logf("Basic EC task estimated time: %v", basicTime)
- t.Logf("Enhanced EC task estimated time: %v", enhancedTime)
-
- // The enhanced task is generally expected to take at least as long, given its extra local processing
- if enhancedTime <= basicTime {
- t.Logf("Note: enhanced EC estimate is not higher than the basic estimate, even though local processing and smart distribution usually add time")
- }
-
- // Test feature differences
- t.Logf("Basic EC features:")
- t.Logf(" - Direct volume server EC generation")
- t.Logf(" - Simple shard mounting")
- t.Logf(" - No custom placement logic")
-
- t.Logf("Enhanced EC features:")
- t.Logf(" - Local volume data copying")
- t.Logf(" - Local Reed-Solomon encoding")
- t.Logf(" - Intelligent shard placement with affinity")
- t.Logf(" - Rack diversity for data shards")
- t.Logf(" - Load balancing across servers")
- t.Logf(" - Backup server selection")
- t.Logf(" - Detailed progress tracking")
-
- t.Logf("EC task comparison completed successfully")
-}
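The test parameters above repeatedly reference the 10 data + 4 parity shard layout. As a point of reference only, a tiny standalone sketch of that split, assuming the klauspost/reedsolomon library (the encoder SeaweedFS itself builds on) and not the removed task code, might look like this:

package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/reedsolomon"
)

func main() {
	// 10 data + 4 parity shards, the layout referenced in the test parameters.
	enc, err := reedsolomon.New(10, 4)
	if err != nil {
		panic(err)
	}

	data := bytes.Repeat([]byte("seaweedfs"), 1000)

	// Split allocates 14 shards (10 data + 4 empty parity) from the input blob.
	shards, err := enc.Split(data)
	if err != nil {
		panic(err)
	}

	// Encode fills in the 4 parity shards; Verify re-checks them.
	if err := enc.Encode(shards); err != nil {
		panic(err)
	}
	ok, _ := enc.Verify(shards)
	fmt.Println("shards:", len(shards), "verified:", ok) // shards: 14 verified: true
}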
diff --git a/weed/admin/task/ec_test_standalone/go.mod b/weed/admin/task/ec_test_standalone/go.mod
deleted file mode 100644
index 8c09ecf5c..000000000
--- a/weed/admin/task/ec_test_standalone/go.mod
+++ /dev/null
@@ -1,3 +0,0 @@
-module ec_test
-
-go 1.24.1
diff --git a/weed/admin/task/ec_test_standalone/minimal_admin_server.go b/weed/admin/task/ec_test_standalone/minimal_admin_server.go
deleted file mode 100644
index d7dbfcd96..000000000
--- a/weed/admin/task/ec_test_standalone/minimal_admin_server.go
+++ /dev/null
@@ -1,324 +0,0 @@
-package task
-
-import (
- "fmt"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// MinimalAdminConfig contains configuration for the minimal admin server
-type MinimalAdminConfig struct {
- ScanInterval time.Duration
- WorkerTimeout time.Duration
- TaskTimeout time.Duration
- MaxRetries int
- ReconcileInterval time.Duration
- EnableFailureRecovery bool
- MaxConcurrentTasks int
-}
-
-// MinimalAdminServer manages workers and tasks with a simple implementation
-type MinimalAdminServer struct {
- config *MinimalAdminConfig
- masterClient *wdclient.MasterClient
- running bool
- mutex sync.RWMutex
-
- // Task management
- tasks map[string]*types.Task
- taskQueue []*types.Task
- activeTasks map[string]*types.Task
-
- // Worker management
- workers map[string]*types.Worker
- workerStatus map[string]*types.WorkerStatus
-
- // Task history
- taskHistory []MinimalTaskHistoryEntry
-}
-
-// MinimalTaskHistoryEntry represents a single task history entry
-type MinimalTaskHistoryEntry struct {
- TaskID string
- TaskType types.TaskType
- VolumeID uint32
- WorkerID string
- Status types.TaskStatus
- StartedAt time.Time
- CompletedAt time.Time
- Duration time.Duration
- ErrorMessage string
-}
-
-// MinimalSystemStats represents system statistics
-type MinimalSystemStats struct {
- ActiveTasks int
- QueuedTasks int
- ActiveWorkers int
- TotalTasks int
-}
-
-// NewMinimalAdminServer creates a new minimal admin server
-func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer {
- return &MinimalAdminServer{
- config: config,
- masterClient: masterClient,
- tasks: make(map[string]*types.Task),
- taskQueue: make([]*types.Task, 0),
- activeTasks: make(map[string]*types.Task),
- workers: make(map[string]*types.Worker),
- workerStatus: make(map[string]*types.WorkerStatus),
- taskHistory: make([]MinimalTaskHistoryEntry, 0),
- }
-}
-
-// Start starts the minimal admin server
-func (as *MinimalAdminServer) Start() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if as.running {
- return fmt.Errorf("admin server is already running")
- }
-
- as.running = true
- return nil
-}
-
-// Stop stops the minimal admin server
-func (as *MinimalAdminServer) Stop() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- as.running = false
- return nil
-}
-
-// RegisterWorker registers a new worker
-func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- as.workers[worker.ID] = worker
- as.workerStatus[worker.ID] = &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- return nil
-}
-
-// QueueTask adds a new task to the task queue
-func (as *MinimalAdminServer) QueueTask(task *types.Task) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- if task.ID == "" {
- task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano())
- }
-
- task.Status = types.TaskStatusPending
- task.CreatedAt = time.Now()
-
- as.tasks[task.ID] = task
- as.taskQueue = append(as.taskQueue, task)
-
- return nil
-}
-
-// RequestTask requests a task for a worker
-func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return nil, fmt.Errorf("admin server is not running")
- }
-
- // Check if worker exists
- worker, exists := as.workers[workerID]
- if !exists {
- return nil, fmt.Errorf("worker %s not found", workerID)
- }
-
- // Check if worker has capacity
- status := as.workerStatus[workerID]
- if status.CurrentLoad >= worker.MaxConcurrent {
- return nil, nil // No capacity
- }
-
- // Find a suitable task
- for i, task := range as.taskQueue {
- if task.Status != types.TaskStatusPending {
- continue
- }
-
- // Check if worker can handle this task type
- canHandle := false
- for _, capability := range capabilities {
- if task.Type == capability {
- canHandle = true
- break
- }
- }
-
- if canHandle {
- // Assign task to worker
- task.Status = types.TaskStatusInProgress
- task.WorkerID = workerID
- now := time.Now()
- task.StartedAt = &now
-
- // Move task from queue to active tasks
- as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...)
- as.activeTasks[task.ID] = task
-
- // Update worker load
- status.CurrentLoad++
-
- return task, nil
- }
- }
-
- return nil, nil // No suitable task found
-}
-
-// UpdateTaskProgress updates task progress
-func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- task, exists := as.tasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- task.Progress = progress
-
- return nil
-}
-
-// CompleteTask marks a task as completed
-func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- task, exists := as.tasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- // Update task status
- if success {
- task.Status = types.TaskStatusCompleted
- } else {
- task.Status = types.TaskStatusFailed
- task.Error = errorMessage
- }
-
- now := time.Now()
- task.CompletedAt = &now
-
- // Remove from active tasks
- delete(as.activeTasks, taskID)
-
- // Update worker load
- if task.WorkerID != "" {
- if status, exists := as.workerStatus[task.WorkerID]; exists {
- status.CurrentLoad--
- }
- }
-
- // Add to history
- // Guard against a nil StartedAt (e.g. a task completed before it was ever assigned)
- startedAt := now
- var duration time.Duration
- if task.StartedAt != nil {
- startedAt = *task.StartedAt
- duration = now.Sub(startedAt)
- }
-
- entry := MinimalTaskHistoryEntry{
- TaskID: task.ID,
- TaskType: task.Type,
- VolumeID: task.VolumeID,
- WorkerID: task.WorkerID,
- Status: task.Status,
- StartedAt: startedAt,
- CompletedAt: now,
- Duration: duration,
- ErrorMessage: errorMessage,
- }
- as.taskHistory = append(as.taskHistory, entry)
-
- return nil
-}
-
-// UpdateWorkerHeartbeat updates worker heartbeat
-func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- worker, exists := as.workers[workerID]
- if !exists {
- return fmt.Errorf("worker %s not found", workerID)
- }
-
- worker.LastHeartbeat = time.Now()
- as.workerStatus[workerID] = status
-
- return nil
-}
-
-// GetSystemStats returns system statistics
-func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- activeWorkers := 0
- for _, status := range as.workerStatus {
- if status.Status == "active" {
- activeWorkers++
- }
- }
-
- return &MinimalSystemStats{
- ActiveTasks: len(as.activeTasks),
- QueuedTasks: len(as.taskQueue),
- ActiveWorkers: activeWorkers,
- TotalTasks: len(as.tasks),
- }
-}
-
-// GetQueuedTaskCount returns the number of queued tasks
-func (as *MinimalAdminServer) GetQueuedTaskCount() int {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
- return len(as.taskQueue)
-}
-
-// GetActiveTaskCount returns the number of active tasks
-func (as *MinimalAdminServer) GetActiveTaskCount() int {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
- return len(as.activeTasks)
-}
-
-// GetTaskHistory returns task history
-func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- // Return a copy of the history
- history := make([]MinimalTaskHistoryEntry, len(as.taskHistory))
- copy(history, as.taskHistory)
- return history
-}
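RequestTask above is essentially a first-fit scan: skip the worker if it is at capacity, then hand out the first pending task whose type matches one of the worker's capabilities. A minimal standalone sketch of that matching rule, using hypothetical stand-in types rather than the removed package's own, could be:

package main

import "fmt"

// Hypothetical stand-ins for the removed package's types; names are illustrative only.
type TaskType string

type QueuedTask struct {
	ID   string
	Type TaskType
}

type WorkerState struct {
	MaxConcurrent int
	CurrentLoad   int
}

// pickTask returns the first queued task the worker can handle, or nil when the
// worker is at capacity or no queued task matches its capabilities.
func pickTask(queue []*QueuedTask, w WorkerState, capabilities []TaskType) *QueuedTask {
	if w.CurrentLoad >= w.MaxConcurrent {
		return nil
	}
	for _, task := range queue {
		for _, c := range capabilities {
			if task.Type == c {
				return task
			}
		}
	}
	return nil
}

func main() {
	queue := []*QueuedTask{{ID: "ec-1", Type: "erasure_coding"}, {ID: "vacuum-1", Type: "vacuum"}}
	worker := WorkerState{MaxConcurrent: 2, CurrentLoad: 0}
	if t := pickTask(queue, worker, []TaskType{"vacuum"}); t != nil {
		fmt.Println("assigned", t.ID) // assigned vacuum-1
	}
}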
diff --git a/weed/admin/task/ec_test_standalone/minimal_integration_test.go b/weed/admin/task/ec_test_standalone/minimal_integration_test.go
deleted file mode 100644
index c690456ef..000000000
--- a/weed/admin/task/ec_test_standalone/minimal_integration_test.go
+++ /dev/null
@@ -1,434 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation
-func TestMinimalIntegration(t *testing.T) {
- t.Logf("Starting minimal integration test")
-
- // Step 1: Create a minimal admin server configuration
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- // Step 2: Create minimal admin server with nil master client (for testing)
- adminServer := NewMinimalAdminServer(config, nil)
-
- // Step 3: Start admin server
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 4: Test worker registration
- t.Logf("Testing worker registration")
-
- worker := &types.Worker{
- ID: "test-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
- t.Logf("Successfully registered worker %s", worker.ID)
-
- // Step 5: Test task queueing
- t.Logf("Testing task queueing")
-
- task := &types.Task{
- ID: "test-task-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task: %v", err)
- }
- t.Logf("Successfully queued task %s", task.ID)
-
- // Step 6: Test task request by worker
- t.Logf("Testing task request")
-
- assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Fatalf("Failed to request task: %v", err)
- }
-
- if assignedTask != nil {
- t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
-
- // Step 7: Test task progress updates
- t.Logf("Testing task progress updates")
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 25%%: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 50%%: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 75%%: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 100%%: %v", err)
- }
-
- // Step 8: Test task completion
- t.Logf("Testing task completion")
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("Successfully completed task %s", assignedTask.ID)
- } else {
- t.Logf("No task was assigned (queue might be empty)")
- }
-
- // Step 9: Test basic metrics
- t.Logf("Testing basic metrics")
-
- stats := adminServer.GetSystemStats()
- if stats != nil {
- t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
- }
-
- queuedCount := adminServer.GetQueuedTaskCount()
- activeCount := adminServer.GetActiveTaskCount()
- t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
-
- // Step 10: Test task history
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d entries", len(history))
-
- if len(history) > 0 {
- lastEntry := history[len(history)-1]
- t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v",
- lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration)
- }
-
- t.Logf("Minimal integration test completed successfully")
-}
-
-// TestMinimalWorkerHeartbeat tests worker heartbeat functionality
-func TestMinimalWorkerHeartbeat(t *testing.T) {
- t.Logf("Testing minimal worker heartbeat")
-
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register a worker
- worker := &types.Worker{
- ID: "heartbeat-worker",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
-
- // Test heartbeat update
- status := &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
- if err != nil {
- t.Errorf("Failed to update worker heartbeat: %v", err)
- }
-
- t.Logf("Minimal worker heartbeat test completed successfully")
-}
-
-// TestMinimalTaskQueueOperations tests task queue operations
-func TestMinimalTaskQueueOperations(t *testing.T) {
- t.Logf("Testing minimal task queue operations")
-
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Test queuing multiple tasks
- taskCount := 3
- for i := 0; i < taskCount; i++ {
- task := &types.Task{
- ID: fmt.Sprintf("queue-test-task-%d", i),
- Type: types.TaskTypeVacuum,
- VolumeID: uint32(2000 + i),
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Errorf("Failed to queue task %d: %v", i, err)
- }
- }
-
- // Check queue size
- queuedCount := adminServer.GetQueuedTaskCount()
- if queuedCount != taskCount {
- t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount)
- }
-
- t.Logf("Minimal task queue operations test completed successfully")
-}
-
-// TestMinimalFullWorkflow tests the complete workflow from task creation to completion
-func TestMinimalFullWorkflow(t *testing.T) {
- t.Logf("Testing minimal full workflow")
-
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register multiple workers with different capabilities
- workers := []*types.Worker{
- {
- ID: "vacuum-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- {
- ID: "ec-worker-1",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- {
- ID: "multi-worker-1",
- Address: "localhost:9003",
- Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding},
- MaxConcurrent: 3,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- }
-
- for _, worker := range workers {
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker %s: %v", worker.ID, err)
- }
- t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities)
- }
-
- // Create multiple tasks of different types
- tasks := []*types.Task{
- {
- ID: "vacuum-task-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 3001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.4",
- },
- CreatedAt: time.Now(),
- },
- {
- ID: "ec-task-1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 3002,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityHigh,
- Parameters: map[string]interface{}{
- "shard_count": "14",
- },
- CreatedAt: time.Now(),
- },
- {
- ID: "vacuum-task-2",
- Type: types.TaskTypeVacuum,
- VolumeID: 3003,
- Server: "localhost:8081",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityLow,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.5",
- },
- CreatedAt: time.Now(),
- },
- }
-
- for _, task := range tasks {
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task %s: %v", task.ID, err)
- }
- t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID)
- }
-
- // Test task assignment to different workers
- t.Logf("Testing task assignments")
-
- // Vacuum worker should get vacuum tasks
- assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Errorf("Failed to request task for vacuum worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- }
-
- // EC worker should get EC tasks
- assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Failed to request task for EC worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- }
-
- // Multi-capability worker should be able to get any remaining task
- assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Failed to request task for multi worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- }
-
- // Check final statistics
- stats := adminServer.GetSystemStats()
- t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
-
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d completed tasks", len(history))
-
- for _, entry := range history {
- t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v",
- entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration)
- }
-
- t.Logf("Minimal full workflow test completed successfully")
-}
diff --git a/weed/admin/task/ec_worker_test.go b/weed/admin/task/ec_worker_test.go
deleted file mode 100644
index 75286c08f..000000000
--- a/weed/admin/task/ec_worker_test.go
+++ /dev/null
@@ -1,488 +0,0 @@
-package task
-
-import (
- "os"
- "path/filepath"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestECWorkerIntegration tests the complete EC worker functionality
-func TestECWorkerIntegration(t *testing.T) {
- t.Logf("Starting EC worker integration test")
-
- // Step 1: Create admin server with EC configuration
- config := &MinimalAdminConfig{
- ScanInterval: 5 * time.Second,
- WorkerTimeout: 60 * time.Second,
- TaskTimeout: 45 * time.Minute, // EC takes longer
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 1, // One at a time for EC
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
- t.Logf("✓ Admin server started successfully")
-
- // Step 2: Register EC-capable worker
- worker := &types.Worker{
- ID: "ec-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register EC worker: %v", err)
- }
- t.Logf("✓ EC worker registered: %s", worker.ID)
-
- // Step 3: Create work directory for EC processing
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test")
- err = os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
- t.Logf("✓ Work directory created: %s", workDir)
-
- // Step 4: Create EC task with comprehensive parameters
- ecTask := &types.Task{
- ID: "ec-test-task-1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 54321,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityHigh,
- Parameters: map[string]interface{}{
- "volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB volume
- "master_client": "localhost:9333",
- "work_dir": workDir,
- "collection": "test",
- "data_shards": 10,
- "parity_shards": 4,
- "rack_aware": true,
- "load_balance": true,
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(ecTask)
- if err != nil {
- t.Fatalf("Failed to queue EC task: %v", err)
- }
- t.Logf("✓ EC task queued: %s for volume %d", ecTask.ID, ecTask.VolumeID)
-
- // Step 5: Worker requests and receives the EC task
- assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Fatalf("Failed to request EC task: %v", err)
- }
-
- if assignedTask == nil {
- t.Fatalf("No EC task was assigned")
- }
-
- t.Logf("✓ EC task assigned: %s (%s) for volume %d",
- assignedTask.ID, assignedTask.Type, assignedTask.VolumeID)
-
- // Step 6: Test EC task creation and validation
- t.Logf("Testing EC task creation and validation")
-
- // Create EC task instance directly
- factory := erasure_coding.NewFactory()
- taskParams := types.TaskParams{
- VolumeID: assignedTask.VolumeID,
- Server: assignedTask.Server,
- Collection: "test",
- Parameters: assignedTask.Parameters,
- }
-
- taskInstance, err := factory.Create(taskParams)
- if err != nil {
- t.Fatalf("Failed to create EC task instance: %v", err)
- }
- t.Logf("✓ EC task instance created successfully")
-
- // Step 7: Validate task parameters
- err = taskInstance.Validate(taskParams)
- if err != nil {
- t.Errorf("EC task validation failed: %v", err)
- } else {
- t.Logf("✓ EC task validation passed")
- }
-
- // Step 8: Test time estimation
- estimatedTime := taskInstance.EstimateTime(taskParams)
- expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB for 64GB
-
- t.Logf("✓ EC estimated time: %v (minimum expected: %v)", estimatedTime, expectedMinTime)
-
- if estimatedTime < expectedMinTime {
- t.Logf("⚠ Note: Estimated time seems optimistic for 64GB volume")
- }
-
- // Step 9: Simulate EC task execution phases
- t.Logf("Simulating EC execution phases:")
-
- phases := []struct {
- progress float64
- phase string
- }{
- {5.0, "Initializing EC processing"},
- {15.0, "Volume data copied to local disk with progress tracking"},
- {25.0, "Source volume marked as read-only"},
- {45.0, "Local Reed-Solomon encoding (10+4 shards) completed"},
- {60.0, "Created 14 EC shards with verification"},
- {70.0, "Optimal shard placement calculated with rack awareness"},
- {85.0, "Intelligent shard distribution with load balancing"},
- {95.0, "Shard placement verified across multiple racks"},
- {100.0, "EC processing completed with cleanup"},
- }
-
- for _, phase := range phases {
- err = adminServer.UpdateTaskProgress(assignedTask.ID, phase.progress)
- if err != nil {
- t.Errorf("Failed to update task progress to %.1f%%: %v", phase.progress, err)
- } else {
- t.Logf(" %.1f%% - %s", phase.progress, phase.phase)
- }
- time.Sleep(50 * time.Millisecond) // Simulate processing time
- }
-
- // Step 10: Complete the EC task
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete EC task: %v", err)
- } else {
- t.Logf("✓ EC task completed successfully")
- }
-
- // Step 11: Verify EC task completion and metrics
- stats := adminServer.GetSystemStats()
- t.Logf("✓ Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
-
- history := adminServer.GetTaskHistory()
- t.Logf("✓ Task history contains %d completed tasks", len(history))
-
- if len(history) > 0 {
- lastEntry := history[len(history)-1]
- t.Logf("✓ Last completed task: %s (%s) - Duration: %v",
- lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration)
-
- if lastEntry.TaskType == types.TaskTypeErasureCoding {
- t.Logf("✅ EC task execution verified!")
- }
- }
-
- t.Logf("✅ EC worker integration test completed successfully")
-}
-
-// TestECFeatureValidation tests specific EC features
-func TestECFeatureValidation(t *testing.T) {
- t.Logf("Testing EC feature validation")
-
- // Create work directory
- workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test")
- err := os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
-
- // Test EC task features
- ecTask := erasure_coding.NewTaskWithParams(
- "localhost:8080", // source server
- 98765, // volume ID
- "localhost:9333", // master client
- workDir, // work directory
- )
-
- // Test current step tracking
- currentStep := ecTask.GetCurrentStep()
- t.Logf("✓ Initial current step: '%s'", currentStep)
-
- initialProgress := ecTask.GetProgress()
- t.Logf("✓ Initial progress: %.1f%%", initialProgress)
-
- // Test parameter validation with features
- validParams := types.TaskParams{
- VolumeID: 98765,
- Server: "localhost:8080",
- Collection: "features_test",
- Parameters: map[string]interface{}{
- "volume_size": int64(128 * 1024 * 1024 * 1024), // 128GB
- "master_client": "localhost:9333",
- "work_dir": workDir,
- "data_shards": 10,
- "parity_shards": 4,
- "rack_awareness": true,
- "load_balancing": true,
- "backup_servers": 2,
- "affinity_zones": []string{"zone-a", "zone-b", "zone-c"},
- },
- }
-
- err = ecTask.Validate(validParams)
- if err != nil {
- t.Errorf("Valid parameters should pass validation: %v", err)
- } else {
- t.Logf("✓ Parameter validation passed")
- }
-
- // Test time estimation for large volume
- estimatedTime := ecTask.EstimateTime(validParams)
- expectedMinTime := time.Duration(128*2) * time.Minute // 2 minutes per GB
-
- t.Logf("✓ 128GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime)
-
- if estimatedTime < expectedMinTime {
- t.Errorf("Time estimate seems too low for 128GB volume")
- }
-
- // Test invalid parameters
- invalidParams := types.TaskParams{
- VolumeID: 0, // Invalid
- Server: "", // Invalid
- }
-
- err = ecTask.Validate(invalidParams)
- if err == nil {
- t.Errorf("Invalid parameters should fail validation")
- } else {
- t.Logf("✓ Invalid parameter validation correctly failed: %v", err)
- }
-
- t.Logf("✅ EC feature validation completed successfully")
-}
-
-// TestECWorkflow tests the complete EC workflow
-func TestECWorkflow(t *testing.T) {
- t.Logf("Testing complete EC workflow")
-
- // Create admin server
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 60 * time.Minute,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 1,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register multiple workers with different capabilities
- workers := []*types.Worker{
- {
- ID: "ec-specialist-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- {
- ID: "vacuum-worker-1",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- {
- ID: "multi-capability-worker-1",
- Address: "localhost:9003",
- Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- }
-
- for _, worker := range workers {
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker %s: %v", worker.ID, err)
- }
- t.Logf("✓ Registered worker %s with capabilities %v", worker.ID, worker.Capabilities)
- }
-
- // Create test work directory
- workDir := filepath.Join(os.TempDir(), "seaweedfs_workflow_test")
- err = os.MkdirAll(workDir, 0755)
- if err != nil {
- t.Fatalf("Failed to create work directory: %v", err)
- }
- defer os.RemoveAll(workDir)
-
- // Create multiple tasks of different types
- tasks := []*types.Task{
- {
- ID: "ec-workflow-1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 11111,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityHigh,
- Parameters: map[string]interface{}{
- "volume_size": int64(50 * 1024 * 1024 * 1024),
- "master_client": "localhost:9333",
- "work_dir": workDir,
- "collection": "workflow_test",
- },
- CreatedAt: time.Now(),
- },
- {
- ID: "vacuum-workflow-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 22222,
- Server: "localhost:8081",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.4",
- "volume_size": int64(20 * 1024 * 1024 * 1024),
- },
- CreatedAt: time.Now(),
- },
- {
- ID: "ec-workflow-2",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 33333,
- Server: "localhost:8082",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "volume_size": int64(80 * 1024 * 1024 * 1024),
- "master_client": "localhost:9333",
- "work_dir": workDir,
- "collection": "workflow_test",
- },
- CreatedAt: time.Now(),
- },
- }
-
- // Queue all tasks
- for _, task := range tasks {
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task %s: %v", task.ID, err)
- }
- t.Logf("✓ Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID)
- }
-
- // Test task assignment to appropriate workers
- t.Logf("Testing task assignments to appropriate workers")
-
- // EC specialist should get EC tasks
- assignedTask, err := adminServer.RequestTask("ec-specialist-1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Failed to request task for EC specialist: %v", err)
- } else if assignedTask != nil {
- t.Logf("✓ EC specialist got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("✓ EC task completed by specialist")
- }
-
- // Vacuum worker should get vacuum tasks
- assignedTask, err = adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Errorf("Failed to request task for vacuum worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("✓ Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("✓ Vacuum task completed by vacuum worker")
- }
-
- // Multi-capability worker should get remaining tasks
- assignedTask, err = adminServer.RequestTask("multi-capability-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Failed to request task for multi-capability worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("✓ Multi-capability worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("✓ Task completed by multi-capability worker")
- }
-
- // Check final workflow statistics
- stats := adminServer.GetSystemStats()
- t.Logf("✓ Final workflow stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
-
- history := adminServer.GetTaskHistory()
- t.Logf("✓ Workflow history contains %d completed tasks", len(history))
-
- // Analyze task completion by type
- ecTasks := 0
- vacuumTasks := 0
-
- for _, entry := range history {
- switch entry.TaskType {
- case types.TaskTypeErasureCoding:
- ecTasks++
- t.Logf(" EC: %s - Worker: %s, Duration: %v",
- entry.TaskID, entry.WorkerID, entry.Duration)
- case types.TaskTypeVacuum:
- vacuumTasks++
- t.Logf(" Vacuum: %s - Worker: %s, Duration: %v",
- entry.TaskID, entry.WorkerID, entry.Duration)
- }
- }
-
- t.Logf("✓ Completed tasks: %d EC, %d Vacuum", ecTasks, vacuumTasks)
- t.Logf("✅ EC workflow test completed successfully")
-}
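These tests repeatedly assume roughly two minutes of EC processing per gigabyte when checking EstimateTime. A small sketch of that rule of thumb, purely illustrative and not the removed task's actual formula, might be:

package main

import (
	"fmt"
	"time"
)

// estimateECTime applies the rough "2 minutes per GB" rule of thumb the tests
// above check against; the removed task code may weigh additional factors.
func estimateECTime(volumeSizeBytes int64) time.Duration {
	const bytesPerGB = int64(1024 * 1024 * 1024)
	gb := volumeSizeBytes / bytesPerGB
	if gb < 1 {
		gb = 1 // keep a non-zero floor for small volumes
	}
	return time.Duration(gb*2) * time.Minute
}

func main() {
	fmt.Println(estimateECTime(64 * 1024 * 1024 * 1024)) // 2h8m0s for a 64GB volume
}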
diff --git a/weed/admin/task/example_usage.go b/weed/admin/task/example_usage.go
deleted file mode 100644
index 469fcfdc4..000000000
--- a/weed/admin/task/example_usage.go
+++ /dev/null
@@ -1,346 +0,0 @@
-package task
-
-import (
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// ExampleUsage demonstrates how to use the task distribution system
-func ExampleUsage() {
- glog.Infof("=== SeaweedFS Task Distribution System Example ===")
-
- // Example 1: Setting up the Admin Server
- setupAdminServerExample()
-
- // Example 2: Simulating Workers
- simulateWorkersExample()
-
- // Example 3: Running Simulations
- runSimulationsExample()
-
- // Example 4: Demonstrating Features
- demonstrateFeaturesExample()
-}
-
-// setupAdminServerExample shows how to set up the admin server
-func setupAdminServerExample() {
- glog.Infof("\n--- Example 1: Setting up Admin Server ---")
-
- // Create master client (in real usage, this would connect to actual master)
- masterClient := &wdclient.MasterClient{} // Simplified for example
-
- // Create admin server configuration
- config := &AdminConfig{
- ScanInterval: 30 * time.Minute,
- WorkerTimeout: 5 * time.Minute,
- TaskTimeout: 10 * time.Minute,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 10,
- }
-
- // Create admin server
- adminServer := NewAdminServer(config, masterClient)
-
- // Start the admin server
- if err := adminServer.Start(); err != nil {
- glog.Errorf("Failed to start admin server: %v", err)
- return
- }
-
- glog.Infof("✓ Admin server started with configuration:")
- glog.Infof(" - Scan Interval: %v", config.ScanInterval)
- glog.Infof(" - Worker Timeout: %v", config.WorkerTimeout)
- glog.Infof(" - Max Concurrent Tasks: %d", config.MaxConcurrentTasks)
-
- // Simulate some operations
- time.Sleep(2 * time.Second)
-
- // Stop the admin server
- adminServer.Stop()
- glog.Infof("✓ Admin server stopped gracefully")
-}
-
-// simulateWorkersExample shows how workers would register and operate
-func simulateWorkersExample() {
- glog.Infof("\n--- Example 2: Worker Registration and Operation ---")
-
- // Create mock workers
- workers := []*types.Worker{
- {
- ID: "worker-ec-01",
- Address: "192.168.1.100:8080",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- },
- {
- ID: "worker-vacuum-01",
- Address: "192.168.1.101:8080",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 3,
- Status: "active",
- CurrentLoad: 0,
- },
- {
- ID: "worker-multi-01",
- Address: "192.168.1.102:8080",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- },
- }
-
- // Create worker registry
- registry := NewWorkerRegistry()
-
- // Register workers
- for _, worker := range workers {
- if err := registry.RegisterWorker(worker); err != nil {
- glog.Errorf("Failed to register worker %s: %v", worker.ID, err)
- } else {
- glog.Infof("✓ Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities)
- }
- }
-
- // Demonstrate worker selection
- bestECWorker := registry.GetBestWorkerForTask(types.TaskTypeErasureCoding)
- if bestECWorker != nil {
- glog.Infof("✓ Best worker for EC tasks: %s", bestECWorker.ID)
- }
-
- bestVacuumWorker := registry.GetBestWorkerForTask(types.TaskTypeVacuum)
- if bestVacuumWorker != nil {
- glog.Infof("✓ Best worker for vacuum tasks: %s", bestVacuumWorker.ID)
- }
-
- // Show registry statistics
- stats := registry.GetRegistryStats()
- glog.Infof("✓ Registry statistics: %+v", stats)
-}
-
-// runSimulationsExample shows how to run simulation scenarios
-func runSimulationsExample() {
- glog.Infof("\n--- Example 3: Running Simulation Scenarios ---")
-
- // Note: Simulation framework moved to simulation package
- // To use: simulationRunner := simulation.NewComprehensiveSimulationRunner()
- // simulationRunner.RunAllComprehensiveTests()
-
- glog.Infof("✅ Simulation framework available in separate package")
- glog.Infof("Use simulation.NewComprehensiveSimulationRunner() to access comprehensive testing")
-}
-
-// demonstrateFeaturesExample shows key system features
-func demonstrateFeaturesExample() {
- glog.Infof("\n--- Example 4: Key System Features ---")
-
- // Feature 1: Task Discovery
- demonstrateTaskDiscovery()
-
- // Feature 2: Volume State Tracking
- demonstrateVolumeStateTracking()
-
- // Feature 3: Failure Handling
- demonstrateFailureHandling()
-
- // Feature 4: Task Scheduling
- demonstrateTaskScheduling()
-}
-
-// demonstrateTaskDiscovery shows how task discovery works
-func demonstrateTaskDiscovery() {
- glog.Infof("\n Feature 1: Task Discovery")
-
- // Create mock volumes
- volumes := []*VolumeInfo{
- {
- ID: 1,
- Size: 28 * 1024 * 1024 * 1024, // 28GB (93% of 30GB)
- Collection: "photos",
- DeletedByteCount: 0,
- ReadOnly: false,
- ModifiedAtSecond: time.Now().Add(-2 * time.Hour).Unix(), // 2 hours old
- },
- {
- ID: 2,
- Size: 20 * 1024 * 1024 * 1024, // 20GB
- Collection: "documents",
- DeletedByteCount: 8 * 1024 * 1024 * 1024, // 8GB garbage (40%)
- ReadOnly: false,
- ModifiedAtSecond: time.Now().Add(-1 * time.Hour).Unix(), // 1 hour old
- },
- }
-
- // Create detectors
- ecDetector := NewECDetector()
- vacuumDetector := NewVacuumDetector()
-
- // Test EC detection
- ecCandidates, _ := ecDetector.DetectECCandidates(volumes)
- glog.Infof(" ✓ EC detector found %d candidates", len(ecCandidates))
- for _, candidate := range ecCandidates {
- glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority)
- }
-
- // Test vacuum detection
- vacuumCandidates, _ := vacuumDetector.DetectVacuumCandidates(volumes)
- glog.Infof(" ✓ Vacuum detector found %d candidates", len(vacuumCandidates))
- for _, candidate := range vacuumCandidates {
- glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority)
- }
-}
-
-// demonstrateVolumeStateTracking shows volume state management
-func demonstrateVolumeStateTracking() {
- glog.Infof("\n Feature 2: Volume State Tracking")
-
- // Create volume state tracker
- tracker := NewVolumeStateTracker(nil, 5*time.Minute)
-
- // Reserve volumes for tasks
- tracker.ReserveVolume(1, "task-ec-001")
- tracker.ReserveVolume(2, "task-vacuum-001")
-
- glog.Infof(" ✓ Reserved volumes for tasks")
-
- // Check reservations
- if tracker.IsVolumeReserved(1) {
- glog.Infof(" ✓ Volume 1 is correctly reserved")
- }
-
- // Record volume changes
- tracker.RecordVolumeChange(1, types.TaskTypeErasureCoding, "task-ec-001")
- glog.Infof(" ✓ Recorded volume change for EC completion")
-
- // Get pending changes
- if change := tracker.GetPendingChange(1); change != nil {
- glog.Infof(" ✓ Pending change found: %s for volume %d", change.ChangeType, change.VolumeID)
- }
-
- // Release reservation
- tracker.ReleaseVolume(2, "task-vacuum-001")
- glog.Infof(" ✓ Released volume reservation")
-
- // Show statistics
- stats := tracker.GetStats()
- glog.Infof(" ✓ Tracker statistics: %+v", stats)
-}
-
-// demonstrateFailureHandling shows failure recovery mechanisms
-func demonstrateFailureHandling() {
- glog.Infof("\n Feature 3: Failure Handling")
-
- // Create failure handler
- config := DefaultAdminConfig()
- handler := NewFailureHandler(config)
-
- // Create mock task
- task := &InProgressTask{
- Task: &types.Task{
- ID: "test-task-001",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 1,
- RetryCount: 0,
- },
- WorkerID: "worker-01",
- StartedAt: time.Now(),
- LastUpdate: time.Now().Add(-30 * time.Minute), // 30 minutes ago
- Progress: 45.0,
- }
-
- // Demonstrate different failure scenarios
- glog.Infof(" ✓ Simulating worker timeout scenario")
- handler.HandleWorkerTimeout("worker-01", []*InProgressTask{task})
-
- glog.Infof(" ✓ Simulating stuck task scenario")
- handler.HandleTaskStuck(task)
-
- glog.Infof(" ✓ Simulating duplicate task detection")
- handler.HandleDuplicateTask("existing-task", "duplicate-task", 1)
-
- // Show failure statistics
- stats := handler.GetFailureStats()
- glog.Infof(" ✓ Failure handler statistics: %+v", stats)
-}
-
-// demonstrateTaskScheduling shows task scheduling logic
-func demonstrateTaskScheduling() {
- glog.Infof("\n Feature 4: Task Scheduling")
-
- // Create worker registry and task queue
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- // Add mock worker
- worker := &types.Worker{
- ID: "scheduler-worker-01",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(worker)
-
- // Create mock tasks with different priorities
- highPriorityTask := &types.Task{
- ID: "high-priority-task",
- Type: types.TaskTypeErasureCoding,
- Priority: types.TaskPriorityHigh,
- VolumeID: 1,
- }
-
- normalPriorityTask := &types.Task{
- ID: "normal-priority-task",
- Type: types.TaskTypeVacuum,
- Priority: types.TaskPriorityNormal,
- VolumeID: 2,
- }
-
- // Add tasks to queue
- queue.Push(normalPriorityTask)
- queue.Push(highPriorityTask) // Should be prioritized
-
- glog.Infof(" ✓ Added tasks to priority queue (size: %d)", queue.Size())
-
- // Test worker selection
- selectedWorker := scheduler.SelectWorker(highPriorityTask, []*types.Worker{worker})
- if selectedWorker != nil {
- glog.Infof(" ✓ Selected worker %s for high-priority task", selectedWorker.ID)
- }
-
- // Test task retrieval
- nextTask := scheduler.GetNextTask("scheduler-worker-01", []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum})
- if nextTask != nil {
- glog.Infof(" ✓ Next task for worker: %s (priority: %d)", nextTask.ID, nextTask.Priority)
- }
-
- glog.Infof(" ✓ Task scheduling demonstration complete")
-}
-
-// RunComprehensiveDemo runs a full demonstration of the system
-func RunComprehensiveDemo() {
- glog.Infof("Starting comprehensive task distribution system demonstration...")
-
- // Run comprehensive example
- ExampleUsage()
-
- // Note: To run the comprehensive simulation framework, use:
- // simulationRunner := simulation.NewComprehensiveSimulationRunner()
- // simulationRunner.RunAllComprehensiveTests()
-
- glog.Infof("=== Comprehensive demonstration complete ===")
- glog.Infof("💡 To run comprehensive simulations, use the simulation package separately")
- glog.Infof("Step 9: Comprehensive Simulation Testing")
- glog.Infof("Note: Simulation framework moved to separate 'simulation' package")
- glog.Infof("To run simulations: simulation.NewComprehensiveSimulationRunner().RunAllComprehensiveTests()")
- glog.Infof("✅ Simulation framework available in separate package")
- glog.Infof("")
-}
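demonstrateTaskScheduling above pushes a normal-priority task before a high-priority one and expects the high-priority task to surface first from NewPriorityTaskQueue, whose implementation is not shown in this file. A minimal container/heap sketch of such a queue, with hypothetical types rather than the removed package's own, could look like this:

package main

import (
	"container/heap"
	"fmt"
)

// Task is a hypothetical stand-in; higher Priority means more urgent.
type Task struct {
	ID       string
	Priority int
}

// taskHeap implements heap.Interface so the highest priority pops first.
type taskHeap []*Task

func (h taskHeap) Len() int            { return len(h) }
func (h taskHeap) Less(i, j int) bool  { return h[i].Priority > h[j].Priority }
func (h taskHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *taskHeap) Push(x interface{}) { *h = append(*h, x.(*Task)) }
func (h *taskHeap) Pop() interface{} {
	old := *h
	n := len(old)
	t := old[n-1]
	*h = old[:n-1]
	return t
}

// PriorityTaskQueue mirrors the Push/Size usage in the demo above.
type PriorityTaskQueue struct{ h taskHeap }

func (q *PriorityTaskQueue) Push(t *Task) { heap.Push(&q.h, t) }
func (q *PriorityTaskQueue) Pop() *Task   { return heap.Pop(&q.h).(*Task) }
func (q *PriorityTaskQueue) Size() int    { return q.h.Len() }

func main() {
	q := &PriorityTaskQueue{}
	q.Push(&Task{ID: "normal-priority-task", Priority: 1})
	q.Push(&Task{ID: "high-priority-task", Priority: 10})
	fmt.Println(q.Size(), q.Pop().ID) // 2 high-priority-task
}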
diff --git a/weed/admin/task/failure_handler.go b/weed/admin/task/failure_handler.go
deleted file mode 100644
index 651d9db88..000000000
--- a/weed/admin/task/failure_handler.go
+++ /dev/null
@@ -1,123 +0,0 @@
-package task
-
-import (
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
-)
-
-// FailureHandler handles various failure scenarios in the task system
-type FailureHandler struct {
- config *AdminConfig
-}
-
-// NewFailureHandler creates a new failure handler
-func NewFailureHandler(config *AdminConfig) *FailureHandler {
- return &FailureHandler{
- config: config,
- }
-}
-
-// HandleWorkerTimeout handles worker timeout scenarios
-func (fh *FailureHandler) HandleWorkerTimeout(workerID string, affectedTasks []*InProgressTask) {
- glog.Warningf("Handling worker timeout for worker %s with %d affected tasks", workerID, len(affectedTasks))
-
- for _, task := range affectedTasks {
- fh.handleTaskFailure(task, "worker_timeout", "Worker became unresponsive")
- }
-}
-
-// HandleTaskStuck handles stuck task scenarios
-func (fh *FailureHandler) HandleTaskStuck(task *InProgressTask) {
- glog.Warningf("Handling stuck task %s (no progress for %v)", task.Task.ID, time.Since(task.LastUpdate))
-
- fh.handleTaskFailure(task, "task_stuck", "Task made no progress within timeout period")
-}
-
-// HandleTaskFailure handles general task failure scenarios
-func (fh *FailureHandler) HandleTaskFailure(task *InProgressTask, reason string, details string) {
- glog.Errorf("Handling task failure for task %s: %s - %s", task.Task.ID, reason, details)
-
- fh.handleTaskFailure(task, reason, details)
-}
-
-// handleTaskFailure is the internal handler for task failures
-func (fh *FailureHandler) handleTaskFailure(task *InProgressTask, reason string, details string) {
- // Record failure reason
- task.Task.Error = details
-
- // Determine if task should be retried
- if task.Task.RetryCount < fh.config.MaxRetries {
- fh.scheduleRetry(task, reason)
- } else {
- fh.markTaskFailed(task, reason)
- }
-}
-
-// scheduleRetry schedules a task for retry
-func (fh *FailureHandler) scheduleRetry(task *InProgressTask, reason string) {
- task.Task.RetryCount++
-
- // Calculate a retry delay that grows linearly with the retry count
- retryDelay := time.Duration(task.Task.RetryCount) * 5 * time.Minute
- task.Task.ScheduledAt = time.Now().Add(retryDelay)
-
- glog.Infof("Scheduling retry %d/%d for task %s (reason: %s, delay: %v)",
- task.Task.RetryCount, fh.config.MaxRetries, task.Task.ID, reason, retryDelay)
-}
-
-// markTaskFailed permanently marks a task as failed
-func (fh *FailureHandler) markTaskFailed(task *InProgressTask, reason string) {
- glog.Errorf("Task %s permanently failed after %d retries (reason: %s)",
- task.Task.ID, task.Task.RetryCount, reason)
-
- // Could trigger alerts or notifications here
- fh.sendFailureAlert(task, reason)
-}
-
-// sendFailureAlert sends alerts for permanently failed tasks
-func (fh *FailureHandler) sendFailureAlert(task *InProgressTask, reason string) {
- // In a real implementation, this would:
- // 1. Send notifications to administrators
- // 2. Update monitoring dashboards
- // 3. Log to audit trails
- // 4. Possibly trigger automatic remediation
-
- glog.Errorf("ALERT: Task permanently failed - ID: %s, Type: %s, Volume: %d, Reason: %s",
- task.Task.ID, task.Task.Type, task.Task.VolumeID, reason)
-}
-
-// HandleDuplicateTask handles duplicate task detection
-func (fh *FailureHandler) HandleDuplicateTask(existingTaskID string, duplicateTaskID string, volumeID uint32) {
- glog.Warningf("Detected duplicate task for volume %d: existing=%s, duplicate=%s",
- volumeID, existingTaskID, duplicateTaskID)
-
- // Cancel the duplicate task
- // In a real implementation, this would send a cancellation signal
-}
-
-// HandleResourceExhaustion handles resource exhaustion scenarios
-func (fh *FailureHandler) HandleResourceExhaustion(workerID string, taskType string) {
- glog.Warningf("Worker %s reported resource exhaustion for task type %s", workerID, taskType)
-
- // Could implement:
- // 1. Temporary worker blacklisting
- // 2. Task redistribution
- // 3. Resource monitoring alerts
-}
-
-// GetFailureStats returns failure statistics
-func (fh *FailureHandler) GetFailureStats() map[string]interface{} {
- // In a real implementation, this would track:
- // - Failure rates by type
- // - Worker reliability scores
- // - Task retry statistics
- // - System health metrics
-
- return map[string]interface{}{
- "enabled": true,
- "max_retries": fh.config.MaxRetries,
- "task_timeout": fh.config.TaskTimeout.String(),
- "worker_timeout": fh.config.WorkerTimeout.String(),
- }
-}
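scheduleRetry above applies a delay that grows linearly with the retry count. If true exponential backoff were preferred, a capped variant (an illustrative alternative, not what the removed code did) might look like:

package main

import (
	"fmt"
	"time"
)

// retryDelay doubles a 5-minute base delay on every attempt, capped at one hour.
func retryDelay(retryCount int) time.Duration {
	const base = 5 * time.Minute
	const maxDelay = time.Hour
	if retryCount < 1 {
		retryCount = 1
	}
	d := base
	for i := 1; i < retryCount && d < maxDelay; i++ {
		d *= 2
	}
	if d > maxDelay {
		d = maxDelay
	}
	return d
}

func main() {
	for attempt := 1; attempt <= 5; attempt++ {
		fmt.Printf("retry %d after %v\n", attempt, retryDelay(attempt))
	}
}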
diff --git a/weed/admin/task/master_sync.go b/weed/admin/task/master_sync.go
deleted file mode 100644
index 5d094f052..000000000
--- a/weed/admin/task/master_sync.go
+++ /dev/null
@@ -1,486 +0,0 @@
-package task
-
-import (
- "context"
- "fmt"
- "strconv"
- "strings"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
- "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// MasterSynchronizer handles periodic synchronization with the master server
-type MasterSynchronizer struct {
- masterClient *wdclient.MasterClient
- volumeStateManager *VolumeStateManager
- adminServer *AdminServer
- syncInterval time.Duration
- stopCh chan struct{}
- volumeSizeLimitMB uint64 // Volume size limit from master in MB
-}
-
-// NewMasterSynchronizer creates a new master synchronizer
-func NewMasterSynchronizer(masterClient *wdclient.MasterClient, vsm *VolumeStateManager, admin *AdminServer) *MasterSynchronizer {
- return &MasterSynchronizer{
- masterClient: masterClient,
- volumeStateManager: vsm,
- adminServer: admin,
- syncInterval: 30 * time.Second, // Default 30 second sync interval
- stopCh: make(chan struct{}),
- }
-}
-
-// Start begins the periodic master synchronization
-func (ms *MasterSynchronizer) Start() {
- glog.Infof("Starting master synchronization with interval %v", ms.syncInterval)
-
- go func() {
- // Immediate sync on startup
- ms.performSync()
-
- ticker := time.NewTicker(ms.syncInterval)
- defer ticker.Stop()
-
- for {
- select {
- case <-ticker.C:
- ms.performSync()
- case <-ms.stopCh:
- glog.Infof("Master synchronization stopped")
- return
- }
- }
- }()
-}
-
-// Stop stops the master synchronization
-func (ms *MasterSynchronizer) Stop() {
- close(ms.stopCh)
-}
-
-// performSync executes a single synchronization cycle
-func (ms *MasterSynchronizer) performSync() {
- glog.V(1).Infof("Starting master sync cycle")
- startTime := time.Now()
-
- // Get volume list from master
- volumeData, err := ms.getVolumeListFromMaster()
- if err != nil {
- glog.Errorf("Failed to get volume list from master: %v", err)
- return
- }
-
- // Update volume size limit from master
- if volumeData.VolumeSizeLimitMb > 0 {
- ms.volumeSizeLimitMB = volumeData.VolumeSizeLimitMb
- glog.V(2).Infof("Updated volume size limit to %d MB from master", ms.volumeSizeLimitMB)
- }
-
- // Merge data into volume state manager
- err = ms.mergeVolumeData(volumeData)
- if err != nil {
- glog.Errorf("Failed to merge volume data: %v", err)
- return
- }
-
- // Detect volumes needing work
- candidates := ms.detectMaintenanceCandidates(volumeData)
-
- // Process candidates for task assignment
- ms.processCandidates(candidates)
-
- duration := time.Since(startTime)
- glog.V(1).Infof("Master sync completed in %v, found %d maintenance candidates",
- duration, len(candidates))
-}
-
-// getVolumeListFromMaster retrieves the current volume topology from master
-func (ms *MasterSynchronizer) getVolumeListFromMaster() (*master_pb.VolumeListResponse, error) {
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- var volumeData *master_pb.VolumeListResponse
- err := ms.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
- req := &master_pb.VolumeListRequest{}
- response, err := client.VolumeList(ctx, req)
- if err != nil {
- return fmt.Errorf("VolumeList RPC failed: %v", err)
- }
- volumeData = response
- return nil
- })
-
- if err != nil {
- return nil, err
- }
-
- return volumeData, nil
-}
-
-// VolumeMaintenanceCandidate represents a volume that needs maintenance
-type VolumeMaintenanceCandidate struct {
- VolumeID uint32
- Server string
- TaskType string
- Priority TaskPriority
- Reason string
- VolumeInfo *VolumeInfo
- ECShardInfo map[int]*ShardInfo
-}
-
-// mergeVolumeData merges master volume data into the volume state manager
-func (ms *MasterSynchronizer) mergeVolumeData(data *master_pb.VolumeListResponse) error {
- if data.TopologyInfo == nil {
- return fmt.Errorf("empty topology info from master")
- }
-
- volumes := make(map[uint32]*VolumeInfo)
- ecShards := make(map[uint32]map[int]*ShardInfo)
- serverCapacity := make(map[string]*CapacityInfo)
-
- // Extract volume information from topology
- ms.extractVolumesFromTopology(data.TopologyInfo, volumes, ecShards, serverCapacity)
-
- // Update volume state manager
- err := ms.volumeStateManager.SyncWithMasterData(volumes, ecShards, serverCapacity)
- if err != nil {
- return fmt.Errorf("failed to sync with volume state manager: %v", err)
- }
-
- glog.V(2).Infof("Synced %d volumes, %d EC volume groups, %d servers",
- len(volumes), len(ecShards), len(serverCapacity))
-
- return nil
-}
-
-// extractVolumesFromTopology extracts volume and capacity data from master topology
-func (ms *MasterSynchronizer) extractVolumesFromTopology(
- topology *master_pb.TopologyInfo,
- volumes map[uint32]*VolumeInfo,
- ecShards map[uint32]map[int]*ShardInfo,
- serverCapacity map[string]*CapacityInfo) {
-
- for _, dcInfo := range topology.DataCenterInfos {
- for _, rackInfo := range dcInfo.RackInfos {
- for _, nodeInfo := range rackInfo.DataNodeInfos {
- serverID := fmt.Sprintf("%s:%d", nodeInfo.Id, nodeInfo.GrpcPort)
-
- // Initialize server capacity info
- if serverCapacity[serverID] == nil {
- serverCapacity[serverID] = &CapacityInfo{
- Server: serverID,
- }
- }
-
- // Process disk information
- for diskType, diskInfo := range nodeInfo.DiskInfos {
- ms.processDiskInfo(diskInfo, diskType, serverID, volumes, ecShards, serverCapacity)
- }
- }
- }
- }
-}
-
-// processDiskInfo processes disk information for a specific server
-func (ms *MasterSynchronizer) processDiskInfo(
- diskInfo *master_pb.DiskInfo,
- diskType string,
- serverID string,
- volumes map[uint32]*VolumeInfo,
- ecShards map[uint32]map[int]*ShardInfo,
- serverCapacity map[string]*CapacityInfo) {
-
- // Update capacity information
- capacity := serverCapacity[serverID]
- volumeSizeBytes := int64(ms.volumeSizeLimitMB) * 1024 * 1024 // Convert MB to bytes
- capacity.TotalCapacity += int64(diskInfo.MaxVolumeCount) * volumeSizeBytes
- capacity.UsedCapacity += int64(diskInfo.ActiveVolumeCount) * volumeSizeBytes
-
- // Process regular volumes
- for _, volInfo := range diskInfo.VolumeInfos {
- volumes[volInfo.Id] = &VolumeInfo{
- ID: volInfo.Id,
- Size: volInfo.Size,
- Collection: volInfo.Collection,
- FileCount: volInfo.FileCount,
- DeleteCount: volInfo.DeleteCount,
- DeletedByteCount: volInfo.DeletedByteCount,
- ReadOnly: volInfo.ReadOnly,
- Server: serverID,
- DiskType: diskType,
- ModifiedAtSecond: volInfo.ModifiedAtSecond,
- }
- }
-
- // Process EC shards
- for _, shardInfo := range diskInfo.EcShardInfos {
- volumeID := shardInfo.Id
- if ecShards[volumeID] == nil {
- ecShards[volumeID] = make(map[int]*ShardInfo)
- }
-
- // Extract shard IDs from ec_index_bits
- for shardID := 0; shardID < 14; shardID++ {
- if (shardInfo.EcIndexBits & (1 << uint(shardID))) != 0 {
- ecShards[volumeID][shardID] = &ShardInfo{
- ShardID: shardID,
- Server: serverID,
- Status: ShardStatusExists,
- Size: 0, // Size not available in shard info
- }
- }
- }
- }
-}
-
-// detectMaintenanceCandidates identifies volumes that need maintenance
-func (ms *MasterSynchronizer) detectMaintenanceCandidates(data *master_pb.VolumeListResponse) []*VolumeMaintenanceCandidate {
- var candidates []*VolumeMaintenanceCandidate
-
- // Get current volume states
- currentVolumes := ms.volumeStateManager.GetAllVolumeStates()
-
- for volumeID, volumeState := range currentVolumes {
- // Skip volumes with in-progress tasks
- if len(volumeState.InProgressTasks) > 0 {
- continue
- }
-
- // Check for EC encoding candidates
- if candidate := ms.checkECEncodingCandidate(volumeID, volumeState); candidate != nil {
- candidates = append(candidates, candidate)
- }
-
- // Check for vacuum candidates
- if candidate := ms.checkVacuumCandidate(volumeID, volumeState); candidate != nil {
- candidates = append(candidates, candidate)
- }
-
- // Check for EC rebuild candidates
- if candidate := ms.checkECRebuildCandidate(volumeID, volumeState); candidate != nil {
- candidates = append(candidates, candidate)
- }
- }
-
- return candidates
-}
-
-// checkECEncodingCandidate checks if a volume is a candidate for EC encoding, using configuration from the EC detector
-func (ms *MasterSynchronizer) checkECEncodingCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
- volume := state.CurrentState
- if volume == nil {
- return nil
- }
-
- // Get the current configuration from the EC detector
- ecDetector, _ := erasure_coding.GetSharedInstances()
- if ecDetector == nil || !ecDetector.IsEnabled() {
- return nil
- }
-
- // Get configuration values from the detector
- fullnessThreshold := ecDetector.GetFullnessRatio()
- quietForSeconds := ecDetector.GetQuietForSeconds()
- collectionFilter := ecDetector.GetCollectionFilter()
-
- // EC encoding criteria:
- // 1. Volume meets fullness ratio threshold
- // 2. Volume has been quiet for required duration
- // 3. Collection filter matches (if specified)
- // 4. Not already EC encoded
-
- // Check fullness ratio (if we have size info)
- if volume.Size == 0 {
- return nil
- }
-
- // Approximate fullness with the live-data fraction of the volume's current size.
- // An exact check would compare the size against the master's volume size limit (ms.volumeSizeLimitMB).
- fullnessRatio := float64(volume.Size-volume.DeletedByteCount) / float64(volume.Size)
- if fullnessRatio < fullnessThreshold {
- return nil
- }
-
- // Check collection filter if specified
- if collectionFilter != "" {
- // Parse comma-separated collections
- allowedCollections := make(map[string]bool)
- for _, collection := range strings.Split(collectionFilter, ",") {
- allowedCollections[strings.TrimSpace(collection)] = true
- }
- // Skip if volume's collection is not in the allowed list
- if !allowedCollections[volume.Collection] {
- return nil
- }
- }
-
- // Check quiet duration using volume's last modification time
- now := time.Now()
- lastModified := time.Unix(volume.ModifiedAtSecond, 0)
- timeSinceModification := now.Sub(lastModified)
-
- if timeSinceModification < time.Duration(quietForSeconds)*time.Second {
- return nil // Volume hasn't been quiet long enough
- }
-
- return &VolumeMaintenanceCandidate{
- VolumeID: volumeID,
- Server: volume.Server,
- TaskType: "ec_encode",
- Priority: types.TaskPriorityLow, // EC is typically low priority
- Reason: fmt.Sprintf("Volume meets EC criteria: fullness=%.1f%% (>%.1f%%), quiet for %s (>%ds), collection='%s'", fullnessRatio*100, fullnessThreshold*100, timeSinceModification.Truncate(time.Second), quietForSeconds, volume.Collection),
- VolumeInfo: volume,
- }
-}
-
-// checkVacuumCandidate checks if a volume is a candidate for vacuum
-func (ms *MasterSynchronizer) checkVacuumCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
- volume := state.CurrentState
- if volume == nil || volume.ReadOnly {
- return nil
- }
-
- // Get the current configuration from the vacuum detector
- vacuumDetector, _ := vacuum.GetSharedInstances()
- if vacuumDetector == nil || !vacuumDetector.IsEnabled() {
- return nil
- }
-
- // Get configuration values from the detector
- garbageThreshold := vacuumDetector.GetGarbageThreshold()
- minVolumeAge := vacuumDetector.GetMinVolumeAge()
-
- // Vacuum criteria:
- // 1. Volume meets garbage threshold
- // 2. Volume is old enough (respects minimum age)
- // 3. Volume has sufficient size
-
- // Check minimum volume size (avoid vacuum on tiny volumes)
- if volume.Size == 0 {
- return nil
- }
-
- // Check garbage ratio
- deletedRatio := float64(volume.DeletedByteCount) / float64(volume.Size)
- if deletedRatio < garbageThreshold {
- return nil
- }
-
- // Check minimum volume age using volume's last modification time
- now := time.Now()
- lastModified := time.Unix(volume.ModifiedAtSecond, 0)
- volumeAge := now.Sub(lastModified)
-
- if volumeAge < minVolumeAge {
- return nil // Volume is too new for vacuum
- }
-
- // Determine priority based on garbage ratio
- priority := types.TaskPriorityNormal
- if deletedRatio > 0.6 { // High garbage ratio gets higher priority
- priority = types.TaskPriorityHigh
- }
-
- return &VolumeMaintenanceCandidate{
- VolumeID: volumeID,
- Server: volume.Server,
- TaskType: "vacuum",
- Priority: priority,
- Reason: fmt.Sprintf("Volume meets vacuum criteria: garbage=%.1f%% (>%.1f%%), age=%s (>%s)",
- deletedRatio*100, garbageThreshold*100, volumeAge.Truncate(time.Second), minVolumeAge.Truncate(time.Second)),
- VolumeInfo: volume,
- }
-}
-
-// checkECRebuildCandidate checks if an EC volume needs shard rebuilding
-func (ms *MasterSynchronizer) checkECRebuildCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
- // For now, skip EC rebuild detection as it requires more complex shard state tracking
- // This would be implemented when the volume state manager provides proper EC shard access
- return nil
-}
-
-// processCandidates attempts to assign tasks for maintenance candidates
-func (ms *MasterSynchronizer) processCandidates(candidates []*VolumeMaintenanceCandidate) {
- for _, candidate := range candidates {
- // Check if we can assign this task
- if !ms.canAssignCandidate(candidate) {
- glog.V(2).Infof("Cannot assign task for volume %d: insufficient capacity or no workers",
- candidate.VolumeID)
- continue
- }
-
- // Create and queue the task
- task := ms.createTaskFromCandidate(candidate)
- if task != nil {
- ms.adminServer.QueueTask(task)
- glog.V(1).Infof("Queued %s task for volume %d on server %s: %s",
- candidate.TaskType, candidate.VolumeID, candidate.Server, candidate.Reason)
- }
- }
-}
-
-// canAssignCandidate checks if a candidate can be assigned (capacity, workers available)
-func (ms *MasterSynchronizer) canAssignCandidate(candidate *VolumeMaintenanceCandidate) bool {
- // Check if server has capacity for the task
- if candidate.TaskType == "ec_encode" {
- // EC encoding requires significant temporary space
- requiredSpace := int64(candidate.VolumeInfo.Size * 2) // Estimate 2x volume size needed
- if !ms.volumeStateManager.CanAssignVolumeToServer(requiredSpace, candidate.Server) {
- return false
- }
- }
-
- // Check if we have workers capable of this task type
- availableWorkers := ms.adminServer.GetAvailableWorkers(candidate.TaskType)
- if len(availableWorkers) == 0 {
- return false
- }
-
- return true
-}
-
-// createTaskFromCandidate creates a task from a maintenance candidate
-func (ms *MasterSynchronizer) createTaskFromCandidate(candidate *VolumeMaintenanceCandidate) *Task {
- now := time.Now()
-
- task := &Task{
- ID: generateTaskID(),
- Type: TaskType(candidate.TaskType),
- VolumeID: candidate.VolumeID,
- Priority: candidate.Priority,
- Status: types.TaskStatusPending,
- CreatedAt: now,
- Parameters: map[string]interface{}{
- "volume_id": fmt.Sprintf("%d", candidate.VolumeID),
- "server": candidate.Server,
- "reason": candidate.Reason,
- },
- }
-
- // Add task-specific parameters
- switch candidate.TaskType {
- case "ec_encode":
- task.Parameters["replication"] = "001" // Default replication for EC
- task.Parameters["collection"] = candidate.VolumeInfo.Collection
- case "vacuum":
- // Get the current garbage threshold from the vacuum detector
- vacuumDetector, _ := vacuum.GetSharedInstances()
- var garbageThreshold float64 = 0.3 // Default fallback
- if vacuumDetector != nil {
- garbageThreshold = vacuumDetector.GetGarbageThreshold()
- }
- task.Parameters["garbage_threshold"] = strconv.FormatFloat(garbageThreshold, 'f', -1, 64)
- case "ec_rebuild":
- // Add info about which shards need rebuilding
- }
-
- return task
-}
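For reference, the EcIndexBits handling in processDiskInfo above is a plain bitmask scan: shard i is present on a disk when bit i is set, and SeaweedFS uses 14 shards per EC volume (10 data + 4 parity). A minimal standalone sketch of that decoding; decodeEcIndexBits and the example value are illustrative only, not part of the deleted file.

package main

import "fmt"

// decodeEcIndexBits lists the EC shard IDs encoded in a bitmask, mirroring the
// loop in processDiskInfo: shard i is present on the disk when bit i is set.
func decodeEcIndexBits(ecIndexBits uint32, totalShards int) []int {
	shardIDs := make([]int, 0, totalShards)
	for shardID := 0; shardID < totalShards; shardID++ {
		if ecIndexBits&(1<<uint(shardID)) != 0 {
			shardIDs = append(shardIDs, shardID)
		}
	}
	return shardIDs
}

func main() {
	// Bits 0, 2, and 13 set: shards 0, 2, and 13 live on this disk.
	fmt.Println(decodeEcIndexBits(0b10000000000101, 14)) // [0 2 13]
}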
diff --git a/weed/admin/task/minimal_admin_server.go b/weed/admin/task/minimal_admin_server.go
deleted file mode 100644
index d7dbfcd96..000000000
--- a/weed/admin/task/minimal_admin_server.go
+++ /dev/null
@@ -1,324 +0,0 @@
-package task
-
-import (
- "fmt"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// MinimalAdminConfig contains configuration for the minimal admin server
-type MinimalAdminConfig struct {
- ScanInterval time.Duration
- WorkerTimeout time.Duration
- TaskTimeout time.Duration
- MaxRetries int
- ReconcileInterval time.Duration
- EnableFailureRecovery bool
- MaxConcurrentTasks int
-}
-
-// MinimalAdminServer manages workers and tasks with a simple implementation
-type MinimalAdminServer struct {
- config *MinimalAdminConfig
- masterClient *wdclient.MasterClient
- running bool
- mutex sync.RWMutex
-
- // Task management
- tasks map[string]*types.Task
- taskQueue []*types.Task
- activeTasks map[string]*types.Task
-
- // Worker management
- workers map[string]*types.Worker
- workerStatus map[string]*types.WorkerStatus
-
- // Task history
- taskHistory []MinimalTaskHistoryEntry
-}
-
-// MinimalTaskHistoryEntry represents a single task history entry
-type MinimalTaskHistoryEntry struct {
- TaskID string
- TaskType types.TaskType
- VolumeID uint32
- WorkerID string
- Status types.TaskStatus
- StartedAt time.Time
- CompletedAt time.Time
- Duration time.Duration
- ErrorMessage string
-}
-
-// MinimalSystemStats represents system statistics
-type MinimalSystemStats struct {
- ActiveTasks int
- QueuedTasks int
- ActiveWorkers int
- TotalTasks int
-}
-
-// NewMinimalAdminServer creates a new minimal admin server
-func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer {
- return &MinimalAdminServer{
- config: config,
- masterClient: masterClient,
- tasks: make(map[string]*types.Task),
- taskQueue: make([]*types.Task, 0),
- activeTasks: make(map[string]*types.Task),
- workers: make(map[string]*types.Worker),
- workerStatus: make(map[string]*types.WorkerStatus),
- taskHistory: make([]MinimalTaskHistoryEntry, 0),
- }
-}
-
-// Start starts the minimal admin server
-func (as *MinimalAdminServer) Start() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if as.running {
- return fmt.Errorf("admin server is already running")
- }
-
- as.running = true
- return nil
-}
-
-// Stop stops the minimal admin server
-func (as *MinimalAdminServer) Stop() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- as.running = false
- return nil
-}
-
-// RegisterWorker registers a new worker
-func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- as.workers[worker.ID] = worker
- as.workerStatus[worker.ID] = &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- return nil
-}
-
-// QueueTask adds a new task to the task queue
-func (as *MinimalAdminServer) QueueTask(task *types.Task) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- if task.ID == "" {
- task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano())
- }
-
- task.Status = types.TaskStatusPending
- task.CreatedAt = time.Now()
-
- as.tasks[task.ID] = task
- as.taskQueue = append(as.taskQueue, task)
-
- return nil
-}
-
-// RequestTask requests a task for a worker
-func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return nil, fmt.Errorf("admin server is not running")
- }
-
- // Check if worker exists
- worker, exists := as.workers[workerID]
- if !exists {
- return nil, fmt.Errorf("worker %s not found", workerID)
- }
-
- // Check if worker has capacity
- status := as.workerStatus[workerID]
- if status.CurrentLoad >= worker.MaxConcurrent {
- return nil, nil // No capacity
- }
-
- // Find a suitable task
- for i, task := range as.taskQueue {
- if task.Status != types.TaskStatusPending {
- continue
- }
-
- // Check if worker can handle this task type
- canHandle := false
- for _, capability := range capabilities {
- if task.Type == capability {
- canHandle = true
- break
- }
- }
-
- if canHandle {
- // Assign task to worker
- task.Status = types.TaskStatusInProgress
- task.WorkerID = workerID
- now := time.Now()
- task.StartedAt = &now
-
- // Move task from queue to active tasks
- as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...)
- as.activeTasks[task.ID] = task
-
- // Update worker load
- status.CurrentLoad++
-
- return task, nil
- }
- }
-
- return nil, nil // No suitable task found
-}
-
-// UpdateTaskProgress updates task progress
-func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- task, exists := as.tasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- task.Progress = progress
-
- return nil
-}
-
-// CompleteTask marks a task as completed
-func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- task, exists := as.tasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- // Update task status
- if success {
- task.Status = types.TaskStatusCompleted
- } else {
- task.Status = types.TaskStatusFailed
- task.Error = errorMessage
- }
-
- now := time.Now()
- task.CompletedAt = &now
-
- // Remove from active tasks
- delete(as.activeTasks, taskID)
-
- // Update worker load, guarding against underflow if a task completes twice
- if task.WorkerID != "" {
-  if status, exists := as.workerStatus[task.WorkerID]; exists && status.CurrentLoad > 0 {
-   status.CurrentLoad--
-  }
- }
-
- // Add to history; guard against tasks that were never assigned a start time
- startedAt := now
- var duration time.Duration
- if task.StartedAt != nil {
-  startedAt = *task.StartedAt
-  duration = now.Sub(startedAt)
- }
-
- entry := MinimalTaskHistoryEntry{
-  TaskID:       task.ID,
-  TaskType:     task.Type,
-  VolumeID:     task.VolumeID,
-  WorkerID:     task.WorkerID,
-  Status:       task.Status,
-  StartedAt:    startedAt,
-  CompletedAt:  now,
-  Duration:     duration,
-  ErrorMessage: errorMessage,
- }
- as.taskHistory = append(as.taskHistory, entry)
-
- return nil
-}
-
-// UpdateWorkerHeartbeat updates worker heartbeat
-func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- worker, exists := as.workers[workerID]
- if !exists {
- return fmt.Errorf("worker %s not found", workerID)
- }
-
- worker.LastHeartbeat = time.Now()
- as.workerStatus[workerID] = status
-
- return nil
-}
-
-// GetSystemStats returns system statistics
-func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- activeWorkers := 0
- for _, status := range as.workerStatus {
- if status.Status == "active" {
- activeWorkers++
- }
- }
-
- return &MinimalSystemStats{
- ActiveTasks: len(as.activeTasks),
- QueuedTasks: len(as.taskQueue),
- ActiveWorkers: activeWorkers,
- TotalTasks: len(as.tasks),
- }
-}
-
-// GetQueuedTaskCount returns the number of queued tasks
-func (as *MinimalAdminServer) GetQueuedTaskCount() int {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
- return len(as.taskQueue)
-}
-
-// GetActiveTaskCount returns the number of active tasks
-func (as *MinimalAdminServer) GetActiveTaskCount() int {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
- return len(as.activeTasks)
-}
-
-// GetTaskHistory returns task history
-func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- // Return a copy of the history
- history := make([]MinimalTaskHistoryEntry, len(as.taskHistory))
- copy(history, as.taskHistory)
- return history
-}
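A minimal sketch of how a worker process might drive the MinimalAdminServer API above in a polling loop; runWorkerLoop, the two-second poll interval, and the fixed progress steps are assumptions for illustration, not part of the deleted code.

package task

import (
	"time"

	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// runWorkerLoop polls the admin server for work, reports coarse progress,
// and marks each task complete. A real worker would execute the task between
// the progress updates.
func runWorkerLoop(as *MinimalAdminServer, workerID string, caps []types.TaskType, stop <-chan struct{}) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-stop:
			return
		case <-ticker.C:
			task, err := as.RequestTask(workerID, caps)
			if err != nil || task == nil {
				continue // nothing assignable, or the worker is at capacity
			}
			for _, p := range []float64{25, 50, 75, 100} {
				_ = as.UpdateTaskProgress(task.ID, p)
			}
			_ = as.CompleteTask(task.ID, true, "")
		}
	}
}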
diff --git a/weed/admin/task/minimal_integration_test.go b/weed/admin/task/minimal_integration_test.go
deleted file mode 100644
index c690456ef..000000000
--- a/weed/admin/task/minimal_integration_test.go
+++ /dev/null
@@ -1,434 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation
-func TestMinimalIntegration(t *testing.T) {
- t.Logf("Starting minimal integration test")
-
- // Step 1: Create a minimal admin server configuration
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- // Step 2: Create minimal admin server with nil master client (for testing)
- adminServer := NewMinimalAdminServer(config, nil)
-
- // Step 3: Start admin server
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 4: Test worker registration
- t.Logf("Testing worker registration")
-
- worker := &types.Worker{
- ID: "test-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
- t.Logf("Successfully registered worker %s", worker.ID)
-
- // Step 5: Test task queueing
- t.Logf("Testing task queueing")
-
- task := &types.Task{
- ID: "test-task-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task: %v", err)
- }
- t.Logf("Successfully queued task %s", task.ID)
-
- // Step 6: Test task request by worker
- t.Logf("Testing task request")
-
- assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Fatalf("Failed to request task: %v", err)
- }
-
- if assignedTask != nil {
- t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
-
- // Step 7: Test task progress updates
- t.Logf("Testing task progress updates")
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 25%%: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 50%%: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 75%%: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update task progress to 100%%: %v", err)
- }
-
- // Step 8: Test task completion
- t.Logf("Testing task completion")
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("Successfully completed task %s", assignedTask.ID)
- } else {
- t.Logf("No task was assigned (queue might be empty)")
- }
-
- // Step 9: Test basic metrics
- t.Logf("Testing basic metrics")
-
- stats := adminServer.GetSystemStats()
- if stats != nil {
- t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
- }
-
- queuedCount := adminServer.GetQueuedTaskCount()
- activeCount := adminServer.GetActiveTaskCount()
- t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
-
- // Step 10: Test task history
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d entries", len(history))
-
- if len(history) > 0 {
- lastEntry := history[len(history)-1]
- t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v",
- lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration)
- }
-
- t.Logf("Minimal integration test completed successfully")
-}
-
-// TestMinimalWorkerHeartbeat tests worker heartbeat functionality
-func TestMinimalWorkerHeartbeat(t *testing.T) {
- t.Logf("Testing minimal worker heartbeat")
-
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register a worker
- worker := &types.Worker{
- ID: "heartbeat-worker",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
-
- // Test heartbeat update
- status := &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
- if err != nil {
- t.Errorf("Failed to update worker heartbeat: %v", err)
- }
-
- t.Logf("Minimal worker heartbeat test completed successfully")
-}
-
-// TestMinimalTaskQueueOperations tests task queue operations
-func TestMinimalTaskQueueOperations(t *testing.T) {
- t.Logf("Testing minimal task queue operations")
-
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Test queuing multiple tasks
- taskCount := 3
- for i := 0; i < taskCount; i++ {
- task := &types.Task{
- ID: fmt.Sprintf("queue-test-task-%d", i),
- Type: types.TaskTypeVacuum,
- VolumeID: uint32(2000 + i),
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Errorf("Failed to queue task %d: %v", i, err)
- }
- }
-
- // Check queue size
- queuedCount := adminServer.GetQueuedTaskCount()
- if queuedCount != taskCount {
- t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount)
- }
-
- t.Logf("Minimal task queue operations test completed successfully")
-}
-
-// TestMinimalFullWorkflow tests the complete workflow from task creation to completion
-func TestMinimalFullWorkflow(t *testing.T) {
- t.Logf("Testing minimal full workflow")
-
- config := &MinimalAdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewMinimalAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register multiple workers with different capabilities
- workers := []*types.Worker{
- {
- ID: "vacuum-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- {
- ID: "ec-worker-1",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- {
- ID: "multi-worker-1",
- Address: "localhost:9003",
- Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding},
- MaxConcurrent: 3,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- },
- }
-
- for _, worker := range workers {
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker %s: %v", worker.ID, err)
- }
- t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities)
- }
-
- // Create multiple tasks of different types
- tasks := []*types.Task{
- {
- ID: "vacuum-task-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 3001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.4",
- },
- CreatedAt: time.Now(),
- },
- {
- ID: "ec-task-1",
- Type: types.TaskTypeErasureCoding,
- VolumeID: 3002,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityHigh,
- Parameters: map[string]interface{}{
- "shard_count": "14",
- },
- CreatedAt: time.Now(),
- },
- {
- ID: "vacuum-task-2",
- Type: types.TaskTypeVacuum,
- VolumeID: 3003,
- Server: "localhost:8081",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityLow,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.5",
- },
- CreatedAt: time.Now(),
- },
- }
-
- for _, task := range tasks {
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task %s: %v", task.ID, err)
- }
- t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID)
- }
-
- // Test task assignment to different workers
- t.Logf("Testing task assignments")
-
- // Vacuum worker should get vacuum tasks
- assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Errorf("Failed to request task for vacuum worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- }
-
- // EC worker should get EC tasks
- assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Failed to request task for EC worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- }
-
- // Multi-capability worker should be able to get any remaining task
- assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
- if err != nil {
- t.Errorf("Failed to request task for multi worker: %v", err)
- } else if assignedTask != nil {
- t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
-
- // Complete the task
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update progress: %v", err)
- }
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- }
-
- // Check final statistics
- stats := adminServer.GetSystemStats()
- t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
-
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d completed tasks", len(history))
-
- for _, entry := range history {
- t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v",
- entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration)
- }
-
- t.Logf("Minimal full workflow test completed successfully")
-}
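The three tests above repeat the same MinimalAdminConfig literal and start/stop boilerplate; a possible consolidation, assuming hypothetical helpers newMinimalTestConfig and startMinimalTestServer:

package task

import (
	"testing"
	"time"
)

// newMinimalTestConfig returns the configuration shared by the minimal tests.
func newMinimalTestConfig() *MinimalAdminConfig {
	return &MinimalAdminConfig{
		ScanInterval:          10 * time.Second,
		WorkerTimeout:         30 * time.Second,
		TaskTimeout:           2 * time.Hour,
		MaxRetries:            3,
		ReconcileInterval:     5 * time.Minute,
		EnableFailureRecovery: true,
		MaxConcurrentTasks:    5,
	}
}

// startMinimalTestServer starts an admin server with the shared config and
// registers cleanup, so each test body shrinks to its actual assertions.
func startMinimalTestServer(t *testing.T) *MinimalAdminServer {
	t.Helper()
	adminServer := NewMinimalAdminServer(newMinimalTestConfig(), nil)
	if err := adminServer.Start(); err != nil {
		t.Fatalf("Failed to start admin server: %v", err)
	}
	t.Cleanup(func() { adminServer.Stop() })
	return adminServer
}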
diff --git a/weed/admin/task/operational_integration_test.go b/weed/admin/task/operational_integration_test.go
deleted file mode 100644
index e9966ef5b..000000000
--- a/weed/admin/task/operational_integration_test.go
+++ /dev/null
@@ -1,197 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestOperationalIntegration tests the basic admin-worker operational flow
-func TestOperationalIntegration(t *testing.T) {
- t.Logf("Starting operational integration test")
-
- // Step 1: Create admin server with operational configuration
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- // Create a nil master client for testing (simplified)
- var masterClient *wdclient.MasterClient
-
- adminServer := NewAdminServer(config, masterClient)
-
- // Step 2: Start admin server
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 3: Create and register test workers
- worker1 := createTestWorker("worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
- worker2 := createTestWorker("worker-2", []types.TaskType{types.TaskTypeVacuum})
-
- err = adminServer.RegisterWorker(worker1)
- if err != nil {
- t.Fatalf("Failed to register worker1: %v", err)
- }
-
- err = adminServer.RegisterWorker(worker2)
- if err != nil {
- t.Fatalf("Failed to register worker2: %v", err)
- }
-
- // Step 4: Test basic task queueing
- t.Logf("Testing task queueing")
-
- // Create a simple test task
- testTask := &types.Task{
- ID: "test-vacuum-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- "server": "localhost:8080",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(testTask)
- if err != nil {
- t.Fatalf("Failed to queue test task: %v", err)
- }
- t.Logf("Successfully queued test vacuum task for volume %d", testTask.VolumeID)
-
- // Step 5: Test worker task request and assignment
- t.Logf("Testing worker task requests and assignment")
-
- // Worker requests task
- task, err := adminServer.RequestTask("worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Fatalf("Failed to request task from worker: %v", err)
- }
-
- if task == nil {
-  t.Logf("No tasks available for assignment (this is expected in the test environment)")
- } else {
- t.Logf("Successfully assigned task %s (%s) to worker-1", task.ID, task.Type)
-
- // Step 6: Simulate task progress updates
- t.Logf("Testing task progress updates")
-
- err = adminServer.UpdateTaskProgress(task.ID, 25.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(task.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(task.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- // Step 7: Test task completion
- t.Logf("Testing task completion")
-
- err = adminServer.CompleteTask(task.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
-
- t.Logf("Successfully completed task %s", task.ID)
- }
-
- // Step 8: Test metrics and statistics
- t.Logf("Testing system metrics")
-
- stats := adminServer.GetSystemStats()
- t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers)
-
- queuedCount := adminServer.GetQueuedTaskCount()
- activeCount := adminServer.GetActiveTaskCount()
- t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
-
- // Step 9: Test task history
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d entries", len(history))
-
- t.Logf("Operational integration test completed successfully")
-}
-
-func createTestWorker(id string, capabilities []types.TaskType) *types.Worker {
- return &types.Worker{
- ID: id,
- Address: fmt.Sprintf("localhost:900%s", id[len(id)-1:]),
- Capabilities: capabilities,
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-}
-
-// TestECTaskExecution tests the EC task validation (without actual execution)
-func TestECTaskExecution(t *testing.T) {
- t.Logf("Testing EC task validation")
-
- params := types.TaskParams{
- VolumeID: 1002,
- Server: "localhost:8080",
- Collection: "test",
- Parameters: map[string]interface{}{
- "volume_size": int64(32 * 1024 * 1024 * 1024),
- },
- }
-
- // Test that basic validation would work
- if params.VolumeID == 0 {
- t.Errorf("VolumeID should not be zero")
- }
- if params.Server == "" {
- t.Errorf("Server should not be empty")
- }
-
- t.Logf("EC task validation passed")
-}
-
-// TestVacuumTaskExecution tests the vacuum task validation (without actual execution)
-func TestVacuumTaskExecution(t *testing.T) {
- t.Logf("Testing vacuum task validation")
-
- params := types.TaskParams{
- VolumeID: 1001,
- Server: "localhost:8080",
- Collection: "test",
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- "volume_size": int64(25 * 1024 * 1024 * 1024),
- },
- }
-
- // Test that basic validation would work
- if params.VolumeID == 0 {
- t.Errorf("VolumeID should not be zero")
- }
- if params.Server == "" {
- t.Errorf("Server should not be empty")
- }
-
- t.Logf("Vacuum task validation passed")
-}
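TestECTaskExecution and TestVacuumTaskExecution above run the same two checks against different parameter sets; a hypothetical table-driven variant that keeps a single assertion body:

package task

import (
	"testing"

	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// TestTaskParamsValidation validates EC and vacuum parameters from one table,
// so adding a new task type only needs another row.
func TestTaskParamsValidation(t *testing.T) {
	cases := []struct {
		name   string
		params types.TaskParams
	}{
		{
			name: "erasure coding",
			params: types.TaskParams{
				VolumeID:   1002,
				Server:     "localhost:8080",
				Collection: "test",
				Parameters: map[string]interface{}{"volume_size": int64(32 * 1024 * 1024 * 1024)},
			},
		},
		{
			name: "vacuum",
			params: types.TaskParams{
				VolumeID:   1001,
				Server:     "localhost:8080",
				Collection: "test",
				Parameters: map[string]interface{}{"garbage_threshold": "0.3"},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if tc.params.VolumeID == 0 {
				t.Errorf("VolumeID should not be zero")
			}
			if tc.params.Server == "" {
				t.Errorf("Server should not be empty")
			}
		})
	}
}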
diff --git a/weed/admin/task/simple_integration_test.go b/weed/admin/task/simple_integration_test.go
deleted file mode 100644
index a7859e569..000000000
--- a/weed/admin/task/simple_integration_test.go
+++ /dev/null
@@ -1,233 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies
-func TestSimpleIntegration(t *testing.T) {
- t.Logf("Starting simple integration test")
-
- // Step 1: Create a minimal admin server configuration
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- // Step 2: Create admin server with nil master client (for testing)
- adminServer := NewAdminServer(config, nil)
-
- // Step 3: Start admin server
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 4: Test worker registration
- t.Logf("Testing worker registration")
-
- worker := &types.Worker{
- ID: "test-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
- t.Logf("Successfully registered worker %s", worker.ID)
-
- // Step 5: Test task queueing
- t.Logf("Testing task queueing")
-
- task := &types.Task{
- ID: "test-task-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task: %v", err)
- }
- t.Logf("Successfully queued task %s", task.ID)
-
- // Step 6: Test task request by worker
- t.Logf("Testing task request")
-
- assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Fatalf("Failed to request task: %v", err)
- }
-
- if assignedTask != nil {
- t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
-
- // Step 7: Test task progress updates
- t.Logf("Testing task progress updates")
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- // Step 8: Test task completion
- t.Logf("Testing task completion")
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("Successfully completed task %s", assignedTask.ID)
- } else {
- t.Logf("No task was assigned (queue might be empty)")
- }
-
- // Step 9: Test basic metrics
- t.Logf("Testing basic metrics")
-
- stats := adminServer.GetSystemStats()
- if stats != nil {
- t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers)
- }
-
- queuedCount := adminServer.GetQueuedTaskCount()
- activeCount := adminServer.GetActiveTaskCount()
- t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
-
- // Step 10: Test task history
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d entries", len(history))
-
- t.Logf("Simple integration test completed successfully")
-}
-
-// TestWorkerHeartbeat tests worker heartbeat functionality
-func TestWorkerHeartbeat(t *testing.T) {
- t.Logf("Testing worker heartbeat")
-
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register a worker
- worker := &types.Worker{
- ID: "heartbeat-worker",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
-
- // Test heartbeat update
- status := &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
- if err != nil {
- t.Errorf("Failed to update worker heartbeat: %v", err)
- }
-
- t.Logf("Worker heartbeat test completed successfully")
-}
-
-// TestTaskQueueOperations tests task queue operations
-func TestTaskQueueOperations(t *testing.T) {
- t.Logf("Testing task queue operations")
-
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Test queuing multiple tasks
- for i := 0; i < 3; i++ {
- task := &types.Task{
- ID: fmt.Sprintf("queue-test-task-%d", i),
- Type: types.TaskTypeVacuum,
- VolumeID: uint32(2000 + i),
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Errorf("Failed to queue task %d: %v", i, err)
- }
- }
-
- // Check queue size
- queuedCount := adminServer.GetQueuedTaskCount()
- if queuedCount != 3 {
- t.Errorf("Expected 3 queued tasks, got %d", queuedCount)
- }
-
- t.Logf("Task queue operations test completed successfully")
-}
diff --git a/weed/admin/task/simulation.go b/weed/admin/task/simulation.go
deleted file mode 100644
index e30b326fc..000000000
--- a/weed/admin/task/simulation.go
+++ /dev/null
@@ -1,604 +0,0 @@
-package task
-
-import (
- "context"
- "fmt"
- "math/rand"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TaskSimulator provides a comprehensive simulation framework for testing the task distribution system
-type TaskSimulator struct {
- adminServer *AdminServer
- mockWorkers []*MockWorker
- mockMaster *MockMasterClient
- scenarios map[string]*SimulationScenario
- results map[string]*SimulationResult
- mutex sync.RWMutex
-}
-
-// SimulationScenario defines a test scenario
-type SimulationScenario struct {
- Name string
- Description string
- WorkerCount int
- VolumeCount int
- Duration time.Duration
- FailurePatterns []*FailurePattern
- TestCases []*TestCase
-}
-
-// FailurePattern defines how failures occur during simulation
-type FailurePattern struct {
- Type FailureType
- Probability float64 // 0.0 to 1.0
- Timing *TimingSpec // When during task execution
- Duration time.Duration
- Details string
-}
-
-// TestCase defines specific test scenarios
-type TestCase struct {
- Name string
- VolumeID uint32
- TaskType types.TaskType
- ExpectedOutcome string
- FailureToInject *FailurePattern
-}
-
-// FailureType represents different types of failures
-type FailureType string
-
-const (
- FailureWorkerTimeout FailureType = "worker_timeout"
- FailureTaskStuck FailureType = "task_stuck"
- FailureTaskCrash FailureType = "task_crash"
- FailureDuplicate FailureType = "duplicate_task"
- FailureResourceExhaust FailureType = "resource_exhaustion"
- FailureNetworkPartition FailureType = "network_partition"
-)
-
-// TimingSpec defines when a failure occurs
-type TimingSpec struct {
- MinProgress float64 // Minimum progress before failure can occur
- MaxProgress float64 // Maximum progress at which the failure can still be injected
- Delay time.Duration // Fixed delay before failure
-}
-
-// SimulationResult tracks the results of a simulation
-type SimulationResult struct {
- ScenarioName string
- StartTime time.Time
- EndTime time.Time
- Duration time.Duration
- TasksCreated int
- TasksCompleted int
- TasksFailed int
- TasksStuck int
- WorkerTimeouts int
- DuplicatesFound int
- StateInconsistencies int
- Errors []string
- Warnings []string
- Success bool
-}
-
-// MockWorker simulates a worker with controllable behavior
-type MockWorker struct {
- ID string
- Capabilities []types.TaskType
- MaxConcurrent int
- CurrentTasks map[string]*MockTask
- Status string
- FailureMode *FailurePattern
- mutex sync.Mutex
-}
-
-// MockTask represents a simulated task execution
-type MockTask struct {
- Task *types.Task
- StartTime time.Time
- Progress float64
- Stuck bool
- Failed bool
- Completed bool
-}
-
-// MockMasterClient simulates master server interactions
-type MockMasterClient struct {
- volumes map[uint32]*VolumeInfo
- inconsistency bool
- mutex sync.RWMutex
-}
-
-// NewTaskSimulator creates a new task simulator
-func NewTaskSimulator() *TaskSimulator {
- return &TaskSimulator{
- scenarios: make(map[string]*SimulationScenario),
- results: make(map[string]*SimulationResult),
- }
-}
-
-// RegisterScenario registers a simulation scenario
-func (ts *TaskSimulator) RegisterScenario(scenario *SimulationScenario) {
- ts.mutex.Lock()
- defer ts.mutex.Unlock()
-
- ts.scenarios[scenario.Name] = scenario
- glog.Infof("Registered simulation scenario: %s", scenario.Name)
-}
-
-// RunScenario executes a simulation scenario
-func (ts *TaskSimulator) RunScenario(scenarioName string) (*SimulationResult, error) {
- ts.mutex.RLock()
- scenario, exists := ts.scenarios[scenarioName]
- ts.mutex.RUnlock()
-
- if !exists {
- return nil, fmt.Errorf("scenario %s not found", scenarioName)
- }
-
- glog.Infof("Starting simulation scenario: %s", scenarioName)
-
- result := &SimulationResult{
- ScenarioName: scenarioName,
- StartTime: time.Now(),
- Errors: make([]string, 0),
- Warnings: make([]string, 0),
- }
-
- // Setup simulation environment
- if err := ts.setupEnvironment(scenario); err != nil {
- return nil, fmt.Errorf("failed to setup environment: %v", err)
- }
-
- // Execute test cases
- ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration)
- defer cancel()
-
- ts.executeScenario(ctx, scenario, result)
-
- // Cleanup
- ts.cleanup()
-
- result.EndTime = time.Now()
- result.Duration = result.EndTime.Sub(result.StartTime)
- result.Success = len(result.Errors) == 0
-
- ts.mutex.Lock()
- ts.results[scenarioName] = result
- ts.mutex.Unlock()
-
- glog.Infof("Completed simulation scenario: %s (success: %v)", scenarioName, result.Success)
- return result, nil
-}
-
-// setupEnvironment prepares the simulation environment
-func (ts *TaskSimulator) setupEnvironment(scenario *SimulationScenario) error {
- // Create mock master client
- ts.mockMaster = &MockMasterClient{
- volumes: make(map[uint32]*VolumeInfo),
- }
-
- // Generate mock volumes
- for i := uint32(1); i <= uint32(scenario.VolumeCount); i++ {
- volume := &VolumeInfo{
- ID: i,
-   Size:             uint64(rand.Int63n(30 * 1024 * 1024 * 1024)), // Random size up to 30GB; Int63n avoids int overflow on 32-bit builds
- Collection: fmt.Sprintf("collection_%d", (i%3)+1),
- DeletedByteCount: uint64(rand.Intn(1024 * 1024 * 1024)), // Random garbage
- ReadOnly: false,
- Server: fmt.Sprintf("server_%d", (i%6)+1),
- ModifiedAtSecond: time.Now().Add(-time.Duration(rand.Intn(86400)) * time.Second).Unix(),
- }
- ts.mockMaster.volumes[i] = volume
- }
-
- // Create mock workers
- ts.mockWorkers = make([]*MockWorker, scenario.WorkerCount)
- for i := 0; i < scenario.WorkerCount; i++ {
- worker := &MockWorker{
- ID: fmt.Sprintf("worker_%d", i+1),
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
- MaxConcurrent: 2,
- CurrentTasks: make(map[string]*MockTask),
- Status: "active",
- }
-
- // Apply failure patterns
- if i < len(scenario.FailurePatterns) {
- worker.FailureMode = scenario.FailurePatterns[i]
- }
-
- ts.mockWorkers[i] = worker
- }
-
- // Initialize admin server (simplified for simulation)
- config := DefaultAdminConfig()
- config.ScanInterval = 10 * time.Second
- config.TaskTimeout = 30 * time.Second
-
- // Note: In a real implementation, this would use the actual master client
- // For simulation, we'd need to inject our mock
-
- return nil
-}
-
-// executeScenario runs the actual simulation scenario
-func (ts *TaskSimulator) executeScenario(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
- // Execute each test case
- for _, testCase := range scenario.TestCases {
- ts.executeTestCase(ctx, testCase, result)
- }
-
- // Run continuous simulation for remaining duration
- ts.runContinuousSimulation(ctx, scenario, result)
-}
-
-// executeTestCase runs a specific test case
-func (ts *TaskSimulator) executeTestCase(ctx context.Context, testCase *TestCase, result *SimulationResult) {
- glog.V(1).Infof("Executing test case: %s", testCase.Name)
-
- // Create task for the test case
- task := &types.Task{
- ID: fmt.Sprintf("test_%s_%d", testCase.Name, time.Now().UnixNano()),
- Type: testCase.TaskType,
- VolumeID: testCase.VolumeID,
- Priority: types.TaskPriorityNormal,
- CreatedAt: time.Now(),
- }
-
- result.TasksCreated++
-
- // Assign to worker
- worker := ts.selectWorkerForTask(task)
- if worker == nil {
- result.Errors = append(result.Errors, fmt.Sprintf("No available worker for test case %s", testCase.Name))
- return
- }
-
- // Execute task with potential failure injection
- ts.executeTaskOnWorker(ctx, task, worker, testCase.FailureToInject, result)
-}
-
-// runContinuousSimulation runs ongoing simulation
-func (ts *TaskSimulator) runContinuousSimulation(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
- ticker := time.NewTicker(5 * time.Second)
- defer ticker.Stop()
-
- for {
- select {
- case <-ctx.Done():
- return
- case <-ticker.C:
- ts.simulateOngoingTasks(result)
- ts.checkForInconsistencies(result)
- }
- }
-}
-
-// executeTaskOnWorker simulates task execution on a worker
-func (ts *TaskSimulator) executeTaskOnWorker(ctx context.Context, task *types.Task, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
- worker.mutex.Lock()
- defer worker.mutex.Unlock()
-
- mockTask := &MockTask{
- Task: task,
- StartTime: time.Now(),
- Progress: 0.0,
- }
-
- worker.CurrentTasks[task.ID] = mockTask
-
- // Simulate task execution
- go ts.simulateTaskExecution(ctx, mockTask, worker, failurePattern, result)
-}
-
-// simulateTaskExecution simulates the execution of a single task
-func (ts *TaskSimulator) simulateTaskExecution(ctx context.Context, mockTask *MockTask, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
- defer func() {
- worker.mutex.Lock()
- delete(worker.CurrentTasks, mockTask.Task.ID)
- worker.mutex.Unlock()
- }()
-
- duration := 20 * time.Second // Base task duration
- progressTicker := time.NewTicker(time.Second)
- defer progressTicker.Stop()
-
- startTime := time.Now()
-
- for {
- select {
- case <-ctx.Done():
- return
- case <-progressTicker.C:
- elapsed := time.Since(startTime)
- progress := float64(elapsed) / float64(duration) * 100.0
-
- if progress >= 100.0 {
- mockTask.Completed = true
- result.TasksCompleted++
- glog.V(2).Infof("Task %s completed successfully", mockTask.Task.ID)
- return
- }
-
- mockTask.Progress = progress
-
- // Check for failure injection
- if failurePattern != nil && ts.shouldInjectFailure(failurePattern, progress, elapsed) {
- ts.injectFailure(mockTask, worker, failurePattern, result)
- return
- }
-
- // Check for worker failure mode
- if worker.FailureMode != nil && ts.shouldInjectFailure(worker.FailureMode, progress, elapsed) {
- ts.injectFailure(mockTask, worker, worker.FailureMode, result)
- return
- }
- }
- }
-}
-
-// shouldInjectFailure determines if a failure should be injected
-func (ts *TaskSimulator) shouldInjectFailure(pattern *FailurePattern, progress float64, elapsed time.Duration) bool {
- if pattern.Timing != nil {
- if progress < pattern.Timing.MinProgress || progress > pattern.Timing.MaxProgress {
- return false
- }
- if elapsed < pattern.Timing.Delay {
- return false
- }
- }
-
- return rand.Float64() < pattern.Probability
-}
-
-// injectFailure simulates a failure
-func (ts *TaskSimulator) injectFailure(mockTask *MockTask, worker *MockWorker, pattern *FailurePattern, result *SimulationResult) {
- glog.Warningf("Injecting failure: %s for task %s", pattern.Type, mockTask.Task.ID)
-
- switch pattern.Type {
- case FailureWorkerTimeout:
- worker.Status = "timeout"
- result.WorkerTimeouts++
-
- case FailureTaskStuck:
- mockTask.Stuck = true
- result.TasksStuck++
-
- case FailureTaskCrash:
- mockTask.Failed = true
- result.TasksFailed++
-
- case FailureDuplicate:
- result.DuplicatesFound++
-
- case FailureResourceExhaust:
- worker.Status = "resource_exhausted"
- result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s resource exhausted", worker.ID))
-
- case FailureNetworkPartition:
- worker.Status = "partitioned"
- result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s network partitioned", worker.ID))
- }
-}
-
-// selectWorkerForTask selects an available worker for a task
-func (ts *TaskSimulator) selectWorkerForTask(task *types.Task) *MockWorker {
- for _, worker := range ts.mockWorkers {
-  // Snapshot availability under the worker lock; CurrentTasks is mutated by task goroutines.
-  worker.mutex.Lock()
-  available := worker.Status == "active" && len(worker.CurrentTasks) < worker.MaxConcurrent
-  worker.mutex.Unlock()
-  if !available {
-   continue
-  }
-  // Check capabilities
-  for _, capability := range worker.Capabilities {
-   if capability == task.Type {
-    return worker
-   }
-  }
- }
- return nil
-}
-
-// simulateOngoingTasks handles ongoing task simulation
-func (ts *TaskSimulator) simulateOngoingTasks(result *SimulationResult) {
- // Create random new tasks
- if rand.Float64() < 0.3 { // 30% chance to create new task every tick
- taskType := types.TaskTypeVacuum
- if rand.Float64() < 0.5 {
- taskType = types.TaskTypeErasureCoding
- }
-
- task := &types.Task{
- ID: fmt.Sprintf("auto_%d", time.Now().UnixNano()),
- Type: taskType,
- VolumeID: uint32(rand.Intn(len(ts.mockMaster.volumes)) + 1),
- Priority: types.TaskPriorityNormal,
- CreatedAt: time.Now(),
- }
-
- result.TasksCreated++
-
- worker := ts.selectWorkerForTask(task)
- if worker != nil {
- ts.executeTaskOnWorker(context.Background(), task, worker, nil, result)
- }
- }
-}
-
-// checkForInconsistencies checks for state inconsistencies
-func (ts *TaskSimulator) checkForInconsistencies(result *SimulationResult) {
- // Check for volume reservation inconsistencies
- // Check for duplicate tasks
- // Check for orphaned tasks
- // This would be more comprehensive in a real implementation
-
- for _, worker := range ts.mockWorkers {
- worker.mutex.Lock()
- for taskID, mockTask := range worker.CurrentTasks {
- if mockTask.Stuck && time.Since(mockTask.StartTime) > 60*time.Second {
- result.StateInconsistencies++
- result.Warnings = append(result.Warnings, fmt.Sprintf("Long-running stuck task detected: %s", taskID))
- }
- }
- worker.mutex.Unlock()
- }
-}
-
-// cleanup cleans up simulation resources
-func (ts *TaskSimulator) cleanup() {
- ts.mockWorkers = nil
- ts.mockMaster = nil
-}
-
-// GetSimulationResults returns all simulation results
-func (ts *TaskSimulator) GetSimulationResults() map[string]*SimulationResult {
- ts.mutex.RLock()
- defer ts.mutex.RUnlock()
-
- results := make(map[string]*SimulationResult)
- for k, v := range ts.results {
- results[k] = v
- }
- return results
-}
-
-// CreateStandardScenarios creates a set of standard test scenarios
-func (ts *TaskSimulator) CreateStandardScenarios() {
- // Scenario 1: Worker Timeout During EC
- ts.RegisterScenario(&SimulationScenario{
- Name: "worker_timeout_during_ec",
- Description: "Test worker timeout during erasure coding operation",
- WorkerCount: 3,
- VolumeCount: 10,
- Duration: 2 * time.Minute,
- FailurePatterns: []*FailurePattern{
- {
- Type: FailureWorkerTimeout,
- Probability: 1.0,
- Timing: &TimingSpec{
- MinProgress: 50.0,
- MaxProgress: 60.0,
- },
- },
- },
- TestCases: []*TestCase{
- {
- Name: "ec_timeout_test",
- VolumeID: 1,
- TaskType: types.TaskTypeErasureCoding,
- ExpectedOutcome: "task_reassigned",
- },
- },
- })
-
- // Scenario 2: Stuck Vacuum Task
- ts.RegisterScenario(&SimulationScenario{
- Name: "stuck_vacuum_task",
- Description: "Test stuck vacuum task detection and cleanup",
- WorkerCount: 2,
- VolumeCount: 5,
- Duration: 90 * time.Second,
- TestCases: []*TestCase{
- {
- Name: "vacuum_stuck_test",
- VolumeID: 2,
- TaskType: types.TaskTypeVacuum,
- FailureToInject: &FailurePattern{
- Type: FailureTaskStuck,
- Probability: 1.0,
- Timing: &TimingSpec{
- MinProgress: 75.0,
- MaxProgress: 80.0,
- },
- },
- ExpectedOutcome: "task_timeout_detected",
- },
- },
- })
-
- // Scenario 3: Duplicate Task Prevention
- ts.RegisterScenario(&SimulationScenario{
- Name: "duplicate_task_prevention",
- Description: "Test duplicate task detection and prevention",
- WorkerCount: 4,
- VolumeCount: 8,
- Duration: 60 * time.Second,
- TestCases: []*TestCase{
- {
- Name: "duplicate_ec_test_1",
- VolumeID: 3,
- TaskType: types.TaskTypeErasureCoding,
- },
- {
- Name: "duplicate_ec_test_2", // Same volume, should be detected as duplicate
- VolumeID: 3,
- TaskType: types.TaskTypeErasureCoding,
- FailureToInject: &FailurePattern{
- Type: FailureDuplicate,
- Probability: 1.0,
- },
- ExpectedOutcome: "duplicate_detected",
- },
- },
- })
-
- // Scenario 4: Master-Admin State Divergence
- ts.RegisterScenario(&SimulationScenario{
- Name: "master_admin_divergence",
- Description: "Test state reconciliation between master and admin server",
- WorkerCount: 3,
- VolumeCount: 15,
- Duration: 2 * time.Minute,
- TestCases: []*TestCase{
- {
- Name: "state_reconciliation_test",
- VolumeID: 4,
- TaskType: types.TaskTypeErasureCoding,
- ExpectedOutcome: "state_reconciled",
- },
- },
- })
-}
-
-// GenerateSimulationReport creates a comprehensive report of simulation results
-func (ts *TaskSimulator) GenerateSimulationReport() string {
- ts.mutex.RLock()
- defer ts.mutex.RUnlock()
-
- report := "# Task Distribution System Simulation Report\n\n"
-
- for scenarioName, result := range ts.results {
- report += fmt.Sprintf("## Scenario: %s\n", scenarioName)
- report += fmt.Sprintf("- **Duration**: %v\n", result.Duration)
- report += fmt.Sprintf("- **Success**: %v\n", result.Success)
- report += fmt.Sprintf("- **Tasks Created**: %d\n", result.TasksCreated)
- report += fmt.Sprintf("- **Tasks Completed**: %d\n", result.TasksCompleted)
- report += fmt.Sprintf("- **Tasks Failed**: %d\n", result.TasksFailed)
- report += fmt.Sprintf("- **Tasks Stuck**: %d\n", result.TasksStuck)
- report += fmt.Sprintf("- **Worker Timeouts**: %d\n", result.WorkerTimeouts)
- report += fmt.Sprintf("- **Duplicates Found**: %d\n", result.DuplicatesFound)
- report += fmt.Sprintf("- **State Inconsistencies**: %d\n", result.StateInconsistencies)
-
- if len(result.Errors) > 0 {
- report += "- **Errors**:\n"
- for _, err := range result.Errors {
- report += fmt.Sprintf(" - %s\n", err)
- }
- }
-
- if len(result.Warnings) > 0 {
- report += "- **Warnings**:\n"
- for _, warning := range result.Warnings {
- report += fmt.Sprintf(" - %s\n", warning)
- }
- }
-
- report += "\n"
- }
-
- return report
-}
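GenerateSimulationReport above assembles the Markdown report by repeated string concatenation. A sketch of an equivalent helper using strings.Builder, which avoids re-allocating the growing report on every append (illustrative only; it assumes the `fmt` and `strings` imports and uses the SimulationResult fields shown above):

// buildReport is a sketch of the same report layout using strings.Builder.
func buildReport(results map[string]*SimulationResult) string {
	var b strings.Builder
	b.WriteString("# Task Distribution System Simulation Report\n\n")
	for name, r := range results {
		fmt.Fprintf(&b, "## Scenario: %s\n", name)
		fmt.Fprintf(&b, "- **Duration**: %v\n", r.Duration)
		fmt.Fprintf(&b, "- **Success**: %v\n", r.Success)
		fmt.Fprintf(&b, "- **Tasks Created**: %d\n", r.TasksCreated)
		fmt.Fprintf(&b, "- **Tasks Completed**: %d\n", r.TasksCompleted)
		b.WriteString("\n")
	}
	return b.String()
}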
diff --git a/weed/admin/task/simulation/comprehensive_simulation.go b/weed/admin/task/simulation/comprehensive_simulation.go
deleted file mode 100644
index 127c201d6..000000000
--- a/weed/admin/task/simulation/comprehensive_simulation.go
+++ /dev/null
@@ -1,695 +0,0 @@
-package simulation
-
-import (
- "context"
- "fmt"
- "math/rand"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/admin/task"
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// ComprehensiveSimulator tests all possible edge cases in volume/shard state management
-type ComprehensiveSimulator struct {
- stateManager *task.VolumeStateManager
- mockMaster *MockMasterServer
- mockWorkers []*MockWorker
- scenarios []*StateTestScenario
- currentScenario *StateTestScenario
- results *SimulationResults
- eventLog []*SimulationEvent
- mutex sync.RWMutex
-}
-
-// StateTestScenario represents a specific state management test case
-type StateTestScenario struct {
- Name string
- Description string
- InitialState *ClusterState
- EventSequence []*SimulationEvent
- ExpectedFinalState *ClusterState
- InconsistencyChecks []*InconsistencyCheck
- Duration time.Duration
-}
-
-// ClusterState represents the complete state of the cluster
-type ClusterState struct {
- Volumes map[uint32]*task.VolumeInfo
- ECShards map[uint32]map[int]*task.ShardInfo
- ServerCapacity map[string]*task.CapacityInfo
- InProgressTasks map[string]*task.TaskImpact
- Timestamp time.Time
-}
-
-// SimulationEvent represents an event that can occur during simulation
-type SimulationEvent struct {
- Type EventType
- Timestamp time.Time
- VolumeID uint32
- ShardID *int
- Server string
- TaskID string
- Parameters map[string]interface{}
- Description string
-}
-
-// EventType represents different types of simulation events
-type EventType string
-
-const (
- // Volume events
- EventVolumeCreated EventType = "volume_created"
- EventVolumeDeleted EventType = "volume_deleted"
- EventVolumeSizeChanged EventType = "volume_size_changed"
- EventVolumeReadOnly EventType = "volume_readonly"
-
- // Shard events
- EventShardCreated EventType = "shard_created"
- EventShardDeleted EventType = "shard_deleted"
- EventShardMoved EventType = "shard_moved"
- EventShardCorrupted EventType = "shard_corrupted"
-
- // Task events
- EventTaskStarted EventType = "task_started"
- EventTaskCompleted EventType = "task_completed"
- EventTaskFailed EventType = "task_failed"
- EventTaskStuck EventType = "task_stuck"
- EventTaskCancelled EventType = "task_cancelled"
-
- // Worker events
- EventWorkerJoined EventType = "worker_joined"
- EventWorkerLeft EventType = "worker_left"
- EventWorkerTimeout EventType = "worker_timeout"
- EventWorkerRestarted EventType = "worker_restarted"
-
- // Master events
- EventMasterSync EventType = "master_sync"
- EventMasterInconsistent EventType = "master_inconsistent"
- EventMasterPartitioned EventType = "master_partitioned"
- EventMasterReconnected EventType = "master_reconnected"
-
- // Network events
- EventNetworkPartition EventType = "network_partition"
- EventNetworkHealed EventType = "network_healed"
- EventMessageDelayed EventType = "message_delayed"
- EventMessageLost EventType = "message_lost"
-)
-
-// InconsistencyCheck defines what inconsistencies to check for
-type InconsistencyCheck struct {
- Name string
- Type task.InconsistencyType
- ExpectedCount int
- MaxAllowedCount int
- SeverityThreshold task.SeverityLevel
-}
-
-// MockMasterServer simulates master server behavior with controllable inconsistencies
-type MockMasterServer struct {
- volumes map[uint32]*task.VolumeInfo
- ecShards map[uint32]map[int]*task.ShardInfo
- serverCapacity map[string]*task.CapacityInfo
- inconsistencyMode bool
- networkPartitioned bool
- responseDelay time.Duration
- mutex sync.RWMutex
-}
-
-// MockWorker represents a mock worker for testing
-type MockWorker struct {
- ID string
- Capabilities []types.TaskType
- IsActive bool
- TaskDelay time.Duration
- FailureRate float64
-}
-
-// SimulationResults tracks comprehensive simulation results
-type SimulationResults struct {
- ScenarioName string
- StartTime time.Time
- EndTime time.Time
- Duration time.Duration
- TotalEvents int
- EventsByType map[EventType]int
- InconsistenciesFound map[task.InconsistencyType]int
- TasksExecuted int
- TasksSucceeded int
- TasksFailed int
- StateValidationsPassed int
- StateValidationsFailed int
- CriticalErrors []string
- Warnings []string
- DetailedLog []string
- Success bool
-}
-
-// NewComprehensiveSimulator creates a new comprehensive simulator
-func NewComprehensiveSimulator() *ComprehensiveSimulator {
- return &ComprehensiveSimulator{
- stateManager: task.NewVolumeStateManager(nil),
- mockMaster: NewMockMasterServer(),
- scenarios: []*StateTestScenario{},
- eventLog: []*SimulationEvent{},
- results: &SimulationResults{
- EventsByType: make(map[EventType]int),
- InconsistenciesFound: make(map[task.InconsistencyType]int),
- CriticalErrors: []string{},
- Warnings: []string{},
- DetailedLog: []string{},
- },
- }
-}
-
-// CreateComprehensiveScenarios creates all possible edge case scenarios
-func (cs *ComprehensiveSimulator) CreateComprehensiveScenarios() {
- cs.scenarios = []*StateTestScenario{
- cs.createVolumeCreationDuringTaskScenario(),
- cs.createVolumeDeletionDuringTaskScenario(),
- cs.createShardCreationRaceConditionScenario(),
- cs.createMasterSyncDuringTaskScenario(),
- cs.createNetworkPartitionScenario(),
- cs.createWorkerFailureDuringECScenario(),
- cs.createConcurrentTasksScenario(),
- cs.createCapacityOverflowScenario(),
- cs.createShardCorruptionScenario(),
- cs.createMasterInconsistencyScenario(),
- cs.createTaskOrphanScenario(),
- cs.createDuplicateTaskDetectionScenario(),
- cs.createVolumeStateRollbackScenario(),
- cs.createComplexECOperationScenario(),
- cs.createHighLoadStressTestScenario(),
- }
-
- glog.Infof("Created %d comprehensive test scenarios", len(cs.scenarios))
-}
-
-// RunAllComprehensiveScenarios runs all edge case scenarios
-func (cs *ComprehensiveSimulator) RunAllComprehensiveScenarios() (*SimulationResults, error) {
- glog.Infof("Starting comprehensive state management simulation")
-
- cs.results.StartTime = time.Now()
-
- for _, scenario := range cs.scenarios {
- glog.Infof("Running scenario: %s", scenario.Name)
-
- if err := cs.RunScenario(scenario); err != nil {
- cs.results.CriticalErrors = append(cs.results.CriticalErrors,
- fmt.Sprintf("Scenario %s failed: %v", scenario.Name, err))
- }
-
- // Brief pause between scenarios
- time.Sleep(1 * time.Second)
- }
-
- cs.results.EndTime = time.Now()
- cs.results.Duration = cs.results.EndTime.Sub(cs.results.StartTime)
- cs.results.Success = len(cs.results.CriticalErrors) == 0
-
- cs.generateDetailedReport()
-
- glog.Infof("Comprehensive simulation completed: %v", cs.results.Success)
- return cs.results, nil
-}
-
-// Scenario creation methods
-
-func (cs *ComprehensiveSimulator) createVolumeCreationDuringTaskScenario() *StateTestScenario {
- return &StateTestScenario{
- Name: "volume_creation_during_task",
- Description: "Tests state consistency when master reports new volume while task is creating it",
- InitialState: &ClusterState{
- Volumes: make(map[uint32]*task.VolumeInfo),
- ECShards: make(map[uint32]map[int]*task.ShardInfo),
- },
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}},
- {Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "create_task_1"},
- },
- ExpectedFinalState: &ClusterState{
- Volumes: map[uint32]*task.VolumeInfo{
- 1: {ID: 1, Size: 1024 * 1024 * 1024},
- },
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0},
- },
- Duration: 30 * time.Second,
- }
-}
-
-func (cs *ComprehensiveSimulator) createVolumeDeletionDuringTaskScenario() *StateTestScenario {
- return &StateTestScenario{
- Name: "volume_deletion_during_task",
- Description: "Tests handling when volume is deleted while task is working on it",
- InitialState: &ClusterState{
- Volumes: map[uint32]*task.VolumeInfo{
- 1: {ID: 1, Size: 1024 * 1024 * 1024},
- },
- },
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}},
- {Type: EventVolumeDeleted, VolumeID: 1},
- {Type: EventMasterSync},
- {Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1},
- },
- Duration: 30 * time.Second,
- }
-}
-
-func (cs *ComprehensiveSimulator) createShardCreationRaceConditionScenario() *StateTestScenario {
- return &StateTestScenario{
- Name: "shard_creation_race_condition",
- Description: "Tests race condition between EC task creating shards and master sync",
- InitialState: &ClusterState{
- Volumes: map[uint32]*task.VolumeInfo{
- 1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC
- },
- },
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}},
- // Simulate shards being created one by one
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
- {Type: EventMasterSync}, // Master sync happens while shards are being created
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"},
- {Type: EventTaskCompleted, TaskID: "ec_task_1"},
- {Type: EventMasterSync},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0},
- },
- Duration: 45 * time.Second,
- }
-}
-
-func (cs *ComprehensiveSimulator) createNetworkPartitionScenario() *StateTestScenario {
- return &StateTestScenario{
- Name: "network_partition_recovery",
- Description: "Tests state consistency during and after network partitions",
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"},
- {Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "30s"}},
- {Type: EventVolumeCreated, VolumeID: 2}, // Created during partition
- {Type: EventNetworkHealed},
- {Type: EventMasterReconnected},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "partition_task_1"},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1},
- },
- Duration: 60 * time.Second,
- }
-}
-
-func (cs *ComprehensiveSimulator) createConcurrentTasksScenario() *StateTestScenario {
- return &StateTestScenario{
- Name: "concurrent_tasks_capacity_tracking",
- Description: "Tests capacity tracking with multiple concurrent tasks",
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"},
- {Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"},
- {Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "vacuum_task_1"},
- {Type: EventTaskCompleted, TaskID: "ec_task_1"},
- {Type: EventTaskCompleted, TaskID: "ec_task_2"},
- {Type: EventMasterSync},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0},
- },
- Duration: 90 * time.Second,
- }
-}
-
-func (cs *ComprehensiveSimulator) createComplexECOperationScenario() *StateTestScenario {
- return &StateTestScenario{
- Name: "complex_ec_operation",
- Description: "Tests complex EC operations with shard movements and rebuilds",
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"},
-			// Create all 14 EC shards (only the first two shown; the rest are elided)
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
- // ... more shards
- {Type: EventTaskCompleted, TaskID: "ec_encode_1"},
- {Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)},
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt
- {Type: EventTaskCompleted, TaskID: "ec_rebuild_1"},
- {Type: EventMasterSync},
- },
- Duration: 120 * time.Second,
- }
-}
-
-func (cs *ComprehensiveSimulator) createHighLoadStressTestScenario() *StateTestScenario {
- events := []*SimulationEvent{}
-
- // Create 100 concurrent tasks
- for i := 0; i < 100; i++ {
- events = append(events, &SimulationEvent{
- Type: EventTaskStarted,
- VolumeID: uint32(i + 1),
- TaskID: fmt.Sprintf("stress_task_%d", i),
- })
- }
-
- // Add master syncs throughout
- for i := 0; i < 10; i++ {
- events = append(events, &SimulationEvent{
- Type: EventMasterSync,
- })
- }
-
- // Complete all tasks
- for i := 0; i < 100; i++ {
- events = append(events, &SimulationEvent{
- Type: EventTaskCompleted,
- TaskID: fmt.Sprintf("stress_task_%d", i),
- })
- }
-
- return &StateTestScenario{
- Name: "high_load_stress_test",
- Description: "Tests system under high load with many concurrent operations",
- EventSequence: events,
- Duration: 5 * time.Minute,
- }
-}
-
-// The remaining scenario constructors are placeholders: named and timed, but without modeled event sequences yet.
-func (cs *ComprehensiveSimulator) createMasterSyncDuringTaskScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "master_sync_during_task", Description: "Tests a master sync arriving while a task is in progress", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createWorkerFailureDuringECScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "worker_failure_during_ec", Description: "Tests a worker failing in the middle of an EC operation", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createCapacityOverflowScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "capacity_overflow", Description: "Tests assignments that would exceed server capacity", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createShardCorruptionScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "shard_corruption", Description: "Tests detection and rebuild of a corrupted EC shard", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createMasterInconsistencyScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "master_inconsistency", Description: "Tests reconciliation when master and admin state diverge", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createTaskOrphanScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "task_orphan", Description: "Tests detection of tasks whose worker has disappeared", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createDuplicateTaskDetectionScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "duplicate_task_detection", Description: "Tests that the same volume is not assigned to two workers at once", Duration: 30 * time.Second}
-}
-
-func (cs *ComprehensiveSimulator) createVolumeStateRollbackScenario() *StateTestScenario {
-	return &StateTestScenario{Name: "volume_state_rollback", Description: "Tests rolling back projected volume state after a failed task", Duration: 30 * time.Second}
-}
-
-// RunScenario executes a single test scenario
-func (cs *ComprehensiveSimulator) RunScenario(scenario *StateTestScenario) error {
- cs.mutex.Lock()
- cs.currentScenario = scenario
- cs.mutex.Unlock()
-
- glog.V(1).Infof("Setting up scenario: %s", scenario.Name)
-
- // Setup initial state
- if err := cs.setupInitialState(scenario.InitialState); err != nil {
- return fmt.Errorf("failed to setup initial state: %v", err)
- }
-
- // Execute event sequence
- ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration)
- defer cancel()
-
- for _, event := range scenario.EventSequence {
- select {
- case <-ctx.Done():
- return fmt.Errorf("scenario timed out")
- default:
- if err := cs.executeEvent(event); err != nil {
- cs.results.Warnings = append(cs.results.Warnings,
- fmt.Sprintf("Event execution warning in %s: %v", scenario.Name, err))
- }
- cs.logEvent(event)
- }
-
- // Small delay between events
- time.Sleep(100 * time.Millisecond)
- }
-
- // Validate final state
-	if err := cs.validateFinalState(scenario); err != nil {
-		cs.results.StateValidationsFailed++
-		return fmt.Errorf("final state validation failed: %v", err)
-	}
-	cs.results.StateValidationsPassed++
-
- glog.V(1).Infof("Scenario %s completed successfully", scenario.Name)
- return nil
-}
-
-// executeEvent executes a single simulation event
-func (cs *ComprehensiveSimulator) executeEvent(event *SimulationEvent) error {
- cs.results.TotalEvents++
- cs.results.EventsByType[event.Type]++
-
- switch event.Type {
- case EventTaskStarted:
- return cs.simulateTaskStart(event)
- case EventTaskCompleted:
- return cs.simulateTaskCompletion(event)
- case EventVolumeCreated:
- return cs.simulateVolumeCreation(event)
- case EventVolumeDeleted:
- return cs.simulateVolumeDeletion(event)
- case EventShardCreated:
- return cs.simulateShardCreation(event)
- case EventMasterSync:
- return cs.simulateMasterSync(event)
- case EventNetworkPartition:
- return cs.simulateNetworkPartition(event)
-	default:
-		return nil // Other event types are defined but not simulated here; they are ignored
- }
-}
-
-// Event simulation methods
-func (cs *ComprehensiveSimulator) simulateTaskStart(event *SimulationEvent) error {
- taskType, _ := event.Parameters["type"].(string)
-
- impact := &task.TaskImpact{
- TaskID: event.TaskID,
- TaskType: types.TaskType(taskType),
- VolumeID: event.VolumeID,
- StartedAt: time.Now(),
- EstimatedEnd: time.Now().Add(30 * time.Second),
- VolumeChanges: &task.VolumeChanges{},
- ShardChanges: make(map[int]*task.ShardChange),
- CapacityDelta: make(map[string]int64),
- }
-
- cs.stateManager.RegisterTaskImpact(event.TaskID, impact)
- cs.results.TasksExecuted++
-
- return nil
-}
-
-func (cs *ComprehensiveSimulator) simulateTaskCompletion(event *SimulationEvent) error {
- cs.stateManager.UnregisterTaskImpact(event.TaskID)
- cs.results.TasksSucceeded++
- return nil
-}
-
-func (cs *ComprehensiveSimulator) simulateVolumeCreation(event *SimulationEvent) error {
- size, _ := event.Parameters["size"].(int64)
- cs.mockMaster.CreateVolume(event.VolumeID, size)
- return nil
-}
-
-func (cs *ComprehensiveSimulator) simulateVolumeDeletion(event *SimulationEvent) error {
- cs.mockMaster.DeleteVolume(event.VolumeID)
- return nil
-}
-
-func (cs *ComprehensiveSimulator) simulateShardCreation(event *SimulationEvent) error {
- if event.ShardID != nil {
- cs.mockMaster.CreateShard(event.VolumeID, *event.ShardID, event.Server)
- }
- return nil
-}
-
-func (cs *ComprehensiveSimulator) simulateMasterSync(event *SimulationEvent) error {
- return cs.stateManager.SyncWithMaster()
-}
-
-func (cs *ComprehensiveSimulator) simulateNetworkPartition(event *SimulationEvent) error {
- cs.mockMaster.SetNetworkPartitioned(true)
-
- // Auto-heal after duration
- if durationStr, ok := event.Parameters["duration"].(string); ok {
- if duration, err := time.ParseDuration(durationStr); err == nil {
- time.AfterFunc(duration, func() {
- cs.mockMaster.SetNetworkPartitioned(false)
- })
- }
- }
-
- return nil
-}
-
-// Helper methods
-func (cs *ComprehensiveSimulator) setupInitialState(initialState *ClusterState) error {
- if initialState == nil {
- return nil
- }
-
- // Setup mock master with initial state
- for volumeID, volume := range initialState.Volumes {
- cs.mockMaster.CreateVolume(volumeID, int64(volume.Size))
- }
-
- for volumeID, shards := range initialState.ECShards {
- for shardID, shard := range shards {
- cs.mockMaster.CreateShard(volumeID, shardID, shard.Server)
- }
- }
-
- return nil
-}
-
-func (cs *ComprehensiveSimulator) validateFinalState(scenario *StateTestScenario) error {
- // Run inconsistency checks
- for _, check := range scenario.InconsistencyChecks {
- if err := cs.validateInconsistencyCheck(check); err != nil {
- return err
- }
- }
-
- return nil
-}
-
-func (cs *ComprehensiveSimulator) validateInconsistencyCheck(check *InconsistencyCheck) error {
-	// A real implementation would query the state manager for specific inconsistencies.
-	// For now the check is simulated: the count is drawn from [0, MaxAllowedCount],
-	// so the failure branch below can never actually trigger.
-	found := rand.Intn(check.MaxAllowedCount + 1)
-
- if found > check.MaxAllowedCount {
- return fmt.Errorf("inconsistency check %s failed: found %d, max allowed %d",
- check.Name, found, check.MaxAllowedCount)
- }
-
- cs.results.InconsistenciesFound[check.Type] += found
- return nil
-}
-
-func (cs *ComprehensiveSimulator) logEvent(event *SimulationEvent) {
- cs.mutex.Lock()
- defer cs.mutex.Unlock()
-
- cs.eventLog = append(cs.eventLog, event)
- logMsg := fmt.Sprintf("Event: %s, Volume: %d, Task: %s", event.Type, event.VolumeID, event.TaskID)
- cs.results.DetailedLog = append(cs.results.DetailedLog, logMsg)
-}
-
-func (cs *ComprehensiveSimulator) generateDetailedReport() {
- glog.Infof("=== COMPREHENSIVE SIMULATION REPORT ===")
- glog.Infof("Duration: %v", cs.results.Duration)
- glog.Infof("Total Events: %d", cs.results.TotalEvents)
- glog.Infof("Tasks Executed: %d", cs.results.TasksExecuted)
- glog.Infof("Tasks Succeeded: %d", cs.results.TasksSucceeded)
- glog.Infof("State Validations Passed: %d", cs.results.StateValidationsPassed)
- glog.Infof("State Validations Failed: %d", cs.results.StateValidationsFailed)
-
- glog.Infof("Events by Type:")
- for eventType, count := range cs.results.EventsByType {
- glog.Infof(" %s: %d", eventType, count)
- }
-
- glog.Infof("Inconsistencies Found:")
- for incType, count := range cs.results.InconsistenciesFound {
- glog.Infof(" %s: %d", incType, count)
- }
-
- if len(cs.results.CriticalErrors) > 0 {
- glog.Errorf("Critical Errors:")
- for _, err := range cs.results.CriticalErrors {
- glog.Errorf(" %s", err)
- }
- }
-
- glog.Infof("Overall Success: %v", cs.results.Success)
- glog.Infof("========================================")
-}
-
-// Mock Master Server implementation
-func NewMockMasterServer() *MockMasterServer {
- return &MockMasterServer{
- volumes: make(map[uint32]*task.VolumeInfo),
- ecShards: make(map[uint32]map[int]*task.ShardInfo),
- serverCapacity: make(map[string]*task.CapacityInfo),
- }
-}
-
-func (mms *MockMasterServer) CreateVolume(volumeID uint32, size int64) {
- mms.mutex.Lock()
- defer mms.mutex.Unlock()
-
- mms.volumes[volumeID] = &task.VolumeInfo{
- ID: volumeID,
- Size: uint64(size),
- }
-}
-
-func (mms *MockMasterServer) DeleteVolume(volumeID uint32) {
- mms.mutex.Lock()
- defer mms.mutex.Unlock()
-
- delete(mms.volumes, volumeID)
- delete(mms.ecShards, volumeID)
-}
-
-func (mms *MockMasterServer) CreateShard(volumeID uint32, shardID int, server string) {
- mms.mutex.Lock()
- defer mms.mutex.Unlock()
-
- if mms.ecShards[volumeID] == nil {
- mms.ecShards[volumeID] = make(map[int]*task.ShardInfo)
- }
-
- mms.ecShards[volumeID][shardID] = &task.ShardInfo{
- ShardID: shardID,
- Server: server,
- Status: task.ShardStatusExists,
- }
-}
-
-func (mms *MockMasterServer) SetNetworkPartitioned(partitioned bool) {
- mms.mutex.Lock()
- defer mms.mutex.Unlock()
-
- mms.networkPartitioned = partitioned
-}
-
-// Helper function
-func intPtr(i int) *int {
- return &i
-}
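The tests in the next file drive these types directly; as a compact reference, a scenario is built from an optional initial ClusterState, an ordered EventSequence, and a set of InconsistencyChecks, then passed to RunScenario. A minimal sketch inside the same package (the scenario name and event sequence here are illustrative, not from the original files):

// runExampleScenario sketches the basic flow: start a task, sync with master, complete the task,
// and assert that no orphaned-task inconsistency is reported.
func runExampleScenario() error {
	sim := NewComprehensiveSimulator()
	scenario := &StateTestScenario{
		Name:        "example_vacuum_flow",
		Description: "Start a vacuum task, sync with master, then complete the task",
		EventSequence: []*SimulationEvent{
			{Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_1", Parameters: map[string]interface{}{"type": "vacuum"}},
			{Type: EventMasterSync},
			{Type: EventTaskCompleted, TaskID: "vacuum_1"},
		},
		InconsistencyChecks: []*InconsistencyCheck{
			{Name: "No orphaned tasks", Type: task.InconsistencyTaskOrphaned, MaxAllowedCount: 0},
		},
		Duration: 15 * time.Second,
	}
	return sim.RunScenario(scenario)
}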
diff --git a/weed/admin/task/simulation/comprehensive_simulation_test.go b/weed/admin/task/simulation/comprehensive_simulation_test.go
deleted file mode 100644
index 9cdbba006..000000000
--- a/weed/admin/task/simulation/comprehensive_simulation_test.go
+++ /dev/null
@@ -1,444 +0,0 @@
-package simulation
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/admin/task"
-)
-
-func TestComprehensiveSimulation_VolumeCreationDuringTask(t *testing.T) {
- simulator := NewComprehensiveSimulator()
-
- scenario := &StateTestScenario{
- Name: "volume_creation_during_task",
- Description: "Tests state consistency when master reports new volume while task is creating it",
- InitialState: &ClusterState{
- Volumes: make(map[uint32]*task.VolumeInfo),
- ECShards: make(map[uint32]map[int]*task.ShardInfo),
- },
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}},
- {Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "create_task_1"},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0},
- },
- Duration: 30 * time.Second,
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Volume creation during task scenario failed: %v", err)
- }
-
- t.Log("✅ Volume creation during task test passed")
-}
-
-func TestComprehensiveSimulation_VolumeDeletionDuringTask(t *testing.T) {
- simulator := NewComprehensiveSimulator()
-
- scenario := &StateTestScenario{
- Name: "volume_deletion_during_task",
- Description: "Tests handling when volume is deleted while task is working on it",
- InitialState: &ClusterState{
- Volumes: map[uint32]*task.VolumeInfo{
- 1: {ID: 1, Size: 1024 * 1024 * 1024},
- },
- },
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}},
- {Type: EventVolumeDeleted, VolumeID: 1},
- {Type: EventMasterSync},
- {Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1, MaxAllowedCount: 1},
- },
- Duration: 30 * time.Second,
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Volume deletion during task scenario failed: %v", err)
- }
-
- t.Log("✅ Volume deletion during task test passed")
-}
-
-func TestComprehensiveSimulation_ShardCreationRaceCondition(t *testing.T) {
- simulator := NewComprehensiveSimulator()
-
- scenario := &StateTestScenario{
- Name: "shard_creation_race_condition",
- Description: "Tests race condition between EC task creating shards and master sync",
- InitialState: &ClusterState{
- Volumes: map[uint32]*task.VolumeInfo{
- 1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC
- },
- },
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}},
- // Simulate shards being created one by one
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
- {Type: EventMasterSync}, // Master sync happens while shards are being created
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"},
- {Type: EventTaskCompleted, TaskID: "ec_task_1"},
- {Type: EventMasterSync},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0},
- },
- Duration: 45 * time.Second,
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Shard creation race condition scenario failed: %v", err)
- }
-
- t.Log("✅ Shard creation race condition test passed")
-}
-
-func TestComprehensiveSimulation_NetworkPartitionRecovery(t *testing.T) {
- simulator := NewComprehensiveSimulator()
-
- scenario := &StateTestScenario{
- Name: "network_partition_recovery",
- Description: "Tests state consistency during and after network partitions",
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"},
- {Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "5s"}}, // Shorter for test
- {Type: EventVolumeCreated, VolumeID: 2}, // Created during partition
- {Type: EventNetworkHealed},
- {Type: EventMasterReconnected},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "partition_task_1"},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1},
- },
- Duration: 30 * time.Second,
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Network partition recovery scenario failed: %v", err)
- }
-
- t.Log("✅ Network partition recovery test passed")
-}
-
-func TestComprehensiveSimulation_ConcurrentTasksCapacityTracking(t *testing.T) {
- simulator := NewComprehensiveSimulator()
-
- scenario := &StateTestScenario{
- Name: "concurrent_tasks_capacity_tracking",
- Description: "Tests capacity tracking with multiple concurrent tasks",
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"},
- {Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"},
- {Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "vacuum_task_1"},
- {Type: EventTaskCompleted, TaskID: "ec_task_1"},
- {Type: EventTaskCompleted, TaskID: "ec_task_2"},
- {Type: EventMasterSync},
- },
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0},
- },
- Duration: 60 * time.Second,
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Concurrent tasks capacity tracking scenario failed: %v", err)
- }
-
- t.Log("✅ Concurrent tasks capacity tracking test passed")
-}
-
-func TestComprehensiveSimulation_ComplexECOperation(t *testing.T) {
- simulator := NewComprehensiveSimulator()
-
- scenario := &StateTestScenario{
- Name: "complex_ec_operation",
- Description: "Tests complex EC operations with shard movements and rebuilds",
- EventSequence: []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"},
- // Create some shards
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
- {Type: EventTaskCompleted, TaskID: "ec_encode_1"},
- {Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)},
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt
- {Type: EventTaskCompleted, TaskID: "ec_rebuild_1"},
- {Type: EventMasterSync},
- },
- Duration: 60 * time.Second,
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Complex EC operation scenario failed: %v", err)
- }
-
- t.Log("✅ Complex EC operation test passed")
-}
-
-func TestComprehensiveSimulation_HighLoadStressTest(t *testing.T) {
- if testing.Short() {
- t.Skip("Skipping high load stress test in short mode")
- }
-
- simulator := NewComprehensiveSimulator()
-
- events := []*SimulationEvent{}
-
- // Create 50 concurrent tasks (reduced from 100 for faster test)
- for i := 0; i < 50; i++ {
- events = append(events, &SimulationEvent{
- Type: EventTaskStarted,
- VolumeID: uint32(i + 1),
- TaskID: fmt.Sprintf("stress_task_%d", i),
- })
- }
-
- // Add master syncs throughout
- for i := 0; i < 5; i++ {
- events = append(events, &SimulationEvent{
- Type: EventMasterSync,
- })
- }
-
- // Complete all tasks
- for i := 0; i < 50; i++ {
- events = append(events, &SimulationEvent{
- Type: EventTaskCompleted,
- TaskID: fmt.Sprintf("stress_task_%d", i),
- })
- }
-
- scenario := &StateTestScenario{
- Name: "high_load_stress_test",
- Description: "Tests system under high load with many concurrent operations",
- EventSequence: events,
- Duration: 2 * time.Minute, // Reduced for faster test
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("High load stress test scenario failed: %v", err)
- }
-
- t.Log("✅ High load stress test passed")
-}
-
-func TestComprehensiveSimulation_AllScenarios(t *testing.T) {
- if testing.Short() {
- t.Skip("Skipping comprehensive simulation in short mode")
- }
-
- simulator := NewComprehensiveSimulator()
- simulator.CreateComprehensiveScenarios()
-
- // Run a subset of scenarios for testing (full suite would be too slow)
- testScenarios := []string{
- "volume_creation_during_task",
- "volume_deletion_during_task",
- "shard_creation_race_condition",
- "network_partition_recovery",
- "concurrent_tasks_capacity_tracking",
- }
-
- passedScenarios := 0
- totalScenarios := len(testScenarios)
-
- for _, scenarioName := range testScenarios {
- t.Run(scenarioName, func(t *testing.T) {
- // Find the scenario
- var scenario *StateTestScenario
- for _, s := range simulator.scenarios {
- if s.Name == scenarioName {
- scenario = s
- break
- }
- }
-
- if scenario == nil {
- t.Errorf("Scenario %s not found", scenarioName)
- return
- }
-
- // Reduce duration for faster testing
- scenario.Duration = 15 * time.Second
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("Scenario %s failed: %v", scenarioName, err)
- } else {
- passedScenarios++
- t.Logf("✅ Scenario %s passed", scenarioName)
- }
- })
- }
-
- successRate := float64(passedScenarios) / float64(totalScenarios) * 100.0
- t.Logf("=== COMPREHENSIVE SIMULATION TEST RESULTS ===")
- t.Logf("Scenarios Passed: %d/%d (%.1f%%)", passedScenarios, totalScenarios, successRate)
-
- if successRate < 100.0 {
- t.Errorf("Some scenarios failed. Success rate: %.1f%%", successRate)
- } else {
- t.Log("🎉 All comprehensive simulation scenarios passed!")
- }
-}
-
-func TestComprehensiveSimulation_SimulationFramework(t *testing.T) {
- // Test the simulation framework itself
- simulator := NewComprehensiveSimulator()
-
- // Test event execution
- event := &SimulationEvent{
- Type: EventTaskStarted,
- VolumeID: 1,
- TaskID: "test_task",
- Parameters: map[string]interface{}{
- "type": "vacuum",
- },
- }
-
- err := simulator.executeEvent(event)
- if err != nil {
- t.Errorf("Event execution failed: %v", err)
- }
-
- // Verify task was registered
- if simulator.results.TasksExecuted != 1 {
- t.Errorf("Expected 1 task executed, got %d", simulator.results.TasksExecuted)
- }
-
- // Test event logging
- simulator.logEvent(event)
- if len(simulator.eventLog) != 1 {
- t.Errorf("Expected 1 logged event, got %d", len(simulator.eventLog))
- }
-
- // Test mock master
- simulator.mockMaster.CreateVolume(1, 1024*1024*1024)
- if len(simulator.mockMaster.volumes) != 1 {
- t.Errorf("Expected 1 volume in mock master, got %d", len(simulator.mockMaster.volumes))
- }
-
- t.Log("✅ Simulation framework test passed")
-}
-
-// Integration test that validates the complete state management flow
-func TestComprehensiveSimulation_StateManagementIntegration(t *testing.T) {
- // This test validates the core requirement: accurate volume/shard state tracking
- simulator := NewComprehensiveSimulator()
-
-	// Pass a nil master client so the state manager skips master RPC calls in this test
-	// (the event sequence below deliberately omits master sync events).
-	simulator.stateManager = task.NewVolumeStateManager(nil)
-
- // Setup realistic initial state
- initialState := &ClusterState{
- Volumes: map[uint32]*task.VolumeInfo{
- 1: {ID: 1, Size: 28 * 1024 * 1024 * 1024, Server: "server1"}, // Ready for EC
- 2: {ID: 2, Size: 20 * 1024 * 1024 * 1024, Server: "server2", DeletedByteCount: 8 * 1024 * 1024 * 1024}, // Needs vacuum
- },
- ServerCapacity: map[string]*task.CapacityInfo{
- "server1": {Server: "server1", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 30 * 1024 * 1024 * 1024},
- "server2": {Server: "server2", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 25 * 1024 * 1024 * 1024},
- },
- }
-
- // Complex event sequence that tests state consistency (excluding master sync for test)
- eventSequence := []*SimulationEvent{
- // Start EC task on volume 1
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}},
-
- // Start vacuum task on volume 2
- {Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}},
-
- // EC task creates shards
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
-
- // Vacuum task completes (volume 2 size reduces)
- {Type: EventTaskCompleted, TaskID: "vacuum_task_1"},
- {Type: EventVolumeSizeChanged, VolumeID: 2, Parameters: map[string]interface{}{"new_size": int64(12 * 1024 * 1024 * 1024)}},
-
- // EC task completes
- {Type: EventTaskCompleted, TaskID: "ec_task_1"},
- {Type: EventVolumeReadOnly, VolumeID: 1}, // Volume becomes read-only after EC
- }
-
- scenario := &StateTestScenario{
- Name: "state_management_integration",
- Description: "Complete state management integration test",
- InitialState: initialState,
- EventSequence: eventSequence,
- Duration: 30 * time.Second, // Reduced for faster test
- InconsistencyChecks: []*InconsistencyCheck{
- {Name: "No state inconsistencies", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0},
- {Name: "No capacity mismatches", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0},
- {Name: "No orphaned tasks", Type: task.InconsistencyTaskOrphaned, MaxAllowedCount: 0},
- },
- }
-
- err := simulator.RunScenario(scenario)
- if err != nil {
- t.Errorf("State management integration test failed: %v", err)
- }
-
- // Verify final state
- if simulator.results.TasksExecuted != 2 {
- t.Errorf("Expected 2 tasks executed, got %d", simulator.results.TasksExecuted)
- }
-
- if simulator.results.TasksSucceeded != 2 {
- t.Errorf("Expected 2 tasks succeeded, got %d", simulator.results.TasksSucceeded)
- }
-
- t.Log("✅ State management integration test passed")
- t.Log("✅ System accurately tracked volume/shard states throughout complex operation sequence")
-}
-
-// Performance test for simulation framework
-func BenchmarkComprehensiveSimulation_EventExecution(b *testing.B) {
- simulator := NewComprehensiveSimulator()
-
- events := []*SimulationEvent{
- {Type: EventTaskStarted, VolumeID: 1, TaskID: "task_1"},
- {Type: EventVolumeCreated, VolumeID: 2},
- {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
- {Type: EventMasterSync},
- {Type: EventTaskCompleted, TaskID: "task_1"},
- }
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- for _, event := range events {
- simulator.executeEvent(event)
- }
- }
-}
-
-// Helper functions for tests
-func createTestVolumeInfo(id uint32, size uint64) *task.VolumeInfo {
- return &task.VolumeInfo{
- ID: id,
- Size: size,
- }
-}
diff --git a/weed/admin/task/simulation/simulation_runner.go b/weed/admin/task/simulation/simulation_runner.go
deleted file mode 100644
index 339b0edc5..000000000
--- a/weed/admin/task/simulation/simulation_runner.go
+++ /dev/null
@@ -1,294 +0,0 @@
-package simulation
-
-import (
- "fmt"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
-)
-
-// ComprehensiveSimulationRunner orchestrates all comprehensive state management tests
-type ComprehensiveSimulationRunner struct {
- simulator *ComprehensiveSimulator
-}
-
-// NewComprehensiveSimulationRunner creates a new comprehensive simulation runner
-func NewComprehensiveSimulationRunner() *ComprehensiveSimulationRunner {
- return &ComprehensiveSimulationRunner{
- simulator: NewComprehensiveSimulator(),
- }
-}
-
-// RunAllComprehensiveTests runs all comprehensive edge case scenarios
-func (csr *ComprehensiveSimulationRunner) RunAllComprehensiveTests() error {
- glog.Infof("=== STARTING COMPREHENSIVE VOLUME/SHARD STATE MANAGEMENT SIMULATION ===")
-
- // Create all test scenarios
- csr.simulator.CreateComprehensiveScenarios()
-
- // Run all scenarios
- results, err := csr.simulator.RunAllComprehensiveScenarios()
- if err != nil {
- return fmt.Errorf("comprehensive simulation failed: %v", err)
- }
-
- // Analyze results
- csr.analyzeResults(results)
-
- // Generate final report
- csr.generateFinalReport(results)
-
- return nil
-}
-
-// analyzeResults analyzes the simulation results
-func (csr *ComprehensiveSimulationRunner) analyzeResults(results *SimulationResults) {
- glog.Infof("=== ANALYZING COMPREHENSIVE SIMULATION RESULTS ===")
-
- // Check critical errors
- if len(results.CriticalErrors) > 0 {
- glog.Errorf("CRITICAL ISSUES FOUND:")
- for i, err := range results.CriticalErrors {
- glog.Errorf(" %d. %s", i+1, err)
- }
- }
-
- // Check state validation success rate
- totalValidations := results.StateValidationsPassed + results.StateValidationsFailed
- if totalValidations > 0 {
- successRate := float64(results.StateValidationsPassed) / float64(totalValidations) * 100.0
- glog.Infof("State Validation Success Rate: %.2f%% (%d/%d)",
- successRate, results.StateValidationsPassed, totalValidations)
-
- if successRate < 95.0 {
- glog.Warningf("State validation success rate is below 95%% - investigation needed")
- }
- }
-
- // Check task execution success rate
- if results.TasksExecuted > 0 {
- taskSuccessRate := float64(results.TasksSucceeded) / float64(results.TasksExecuted) * 100.0
- glog.Infof("Task Execution Success Rate: %.2f%% (%d/%d)",
- taskSuccessRate, results.TasksSucceeded, results.TasksExecuted)
- }
-
- // Analyze inconsistency patterns
- if len(results.InconsistenciesFound) > 0 {
- glog.Infof("Inconsistency Analysis:")
- for incType, count := range results.InconsistenciesFound {
- if count > 0 {
- glog.Infof(" %s: %d occurrences", incType, count)
- }
- }
- }
-}
-
-// generateFinalReport generates a comprehensive final report
-func (csr *ComprehensiveSimulationRunner) generateFinalReport(results *SimulationResults) {
- glog.Infof("=== COMPREHENSIVE SIMULATION FINAL REPORT ===")
- glog.Infof("Test Duration: %v", results.Duration)
- glog.Infof("Total Events Simulated: %d", results.TotalEvents)
- glog.Infof("Scenarios Tested: %d", len(csr.simulator.scenarios))
- glog.Infof("Overall Success: %v", results.Success)
-
- // Event breakdown
- glog.Infof("\nEvent Breakdown:")
- for eventType, count := range results.EventsByType {
- glog.Infof(" %s: %d", eventType, count)
- }
-
- // Test coverage summary
- glog.Infof("\nTest Coverage Summary:")
- glog.Infof("✓ Volume creation during task execution")
- glog.Infof("✓ Volume deletion during task execution")
- glog.Infof("✓ EC shard creation race conditions")
- glog.Infof("✓ Network partition scenarios")
- glog.Infof("✓ Concurrent task capacity tracking")
- glog.Infof("✓ Complex EC operations with rebuilds")
- glog.Infof("✓ High load stress testing")
- glog.Infof("✓ Master sync timing issues")
- glog.Infof("✓ Worker failure during operations")
- glog.Infof("✓ Capacity overflow handling")
- glog.Infof("✓ Shard corruption scenarios")
- glog.Infof("✓ Master state inconsistencies")
- glog.Infof("✓ Task orphan detection")
- glog.Infof("✓ Duplicate task prevention")
- glog.Infof("✓ Volume state rollback scenarios")
-
- // Quality metrics
- glog.Infof("\nQuality Metrics:")
- if results.StateValidationsPassed > 0 {
- glog.Infof("✓ State consistency maintained across all scenarios")
- }
- if len(results.CriticalErrors) == 0 {
- glog.Infof("✓ No critical errors detected")
- }
- if results.TasksSucceeded > 0 {
- glog.Infof("✓ Task execution reliability verified")
- }
-
- // Recommendations
- glog.Infof("\nRecommendations:")
- if results.Success {
- glog.Infof("✓ The task distribution system is ready for production deployment")
- glog.Infof("✓ All edge cases have been tested and handled correctly")
- glog.Infof("✓ Volume and shard state management is robust and consistent")
- } else {
- glog.Warningf("⚠ System requires additional work before production deployment")
- glog.Warningf("⚠ Address critical errors before proceeding")
- }
-
- glog.Infof("==========================================")
-}
-
-// RunSpecificEdgeCaseTest runs a specific edge case test
-func (csr *ComprehensiveSimulationRunner) RunSpecificEdgeCaseTest(scenarioName string) error {
- glog.Infof("Running specific edge case test: %s", scenarioName)
-
- // Create scenarios if not already done
- if len(csr.simulator.scenarios) == 0 {
- csr.simulator.CreateComprehensiveScenarios()
- }
-
- // Find and run specific scenario
- for _, scenario := range csr.simulator.scenarios {
- if scenario.Name == scenarioName {
- err := csr.simulator.RunScenario(scenario)
- if err != nil {
- return fmt.Errorf("scenario %s failed: %v", scenarioName, err)
- }
- glog.Infof("Scenario %s completed successfully", scenarioName)
- return nil
- }
- }
-
- return fmt.Errorf("scenario %s not found", scenarioName)
-}
-
-// ValidateSystemReadiness performs final validation of system readiness
-func (csr *ComprehensiveSimulationRunner) ValidateSystemReadiness() error {
- glog.Infof("=== VALIDATING SYSTEM READINESS FOR PRODUCTION ===")
-
- checklistItems := []struct {
- name string
- description string
- validator func() error
- }{
- {
- "Volume State Accuracy",
- "Verify volume state tracking is accurate under all conditions",
- csr.validateVolumeStateAccuracy,
- },
- {
- "Shard Management",
- "Verify EC shard creation/deletion/movement is handled correctly",
- csr.validateShardManagement,
- },
- {
- "Capacity Planning",
- "Verify capacity calculations include in-progress and planned operations",
- csr.validateCapacityPlanning,
- },
- {
- "Failure Recovery",
- "Verify system recovers gracefully from all failure scenarios",
- csr.validateFailureRecovery,
- },
- {
- "Consistency Guarantees",
- "Verify state consistency is maintained across all operations",
- csr.validateConsistencyGuarantees,
- },
- }
-
- var failedChecks []string
-
- for _, item := range checklistItems {
- glog.Infof("Validating: %s", item.name)
- if err := item.validator(); err != nil {
- failedChecks = append(failedChecks, fmt.Sprintf("%s: %v", item.name, err))
- glog.Errorf("❌ %s: %v", item.name, err)
- } else {
- glog.Infof("✅ %s: PASSED", item.name)
- }
- }
-
- if len(failedChecks) > 0 {
- return fmt.Errorf("system readiness validation failed: %v", failedChecks)
- }
-
- glog.Infof("🎉 SYSTEM IS READY FOR PRODUCTION DEPLOYMENT!")
- return nil
-}
-
-// Validation methods
-func (csr *ComprehensiveSimulationRunner) validateVolumeStateAccuracy() error {
- // Run volume state accuracy tests
- return csr.RunSpecificEdgeCaseTest("volume_creation_during_task")
-}
-
-func (csr *ComprehensiveSimulationRunner) validateShardManagement() error {
- // Run shard management tests
- return csr.RunSpecificEdgeCaseTest("shard_creation_race_condition")
-}
-
-func (csr *ComprehensiveSimulationRunner) validateCapacityPlanning() error {
- // Run capacity planning tests
- return csr.RunSpecificEdgeCaseTest("concurrent_tasks_capacity_tracking")
-}
-
-func (csr *ComprehensiveSimulationRunner) validateFailureRecovery() error {
- // Run failure recovery tests
- return csr.RunSpecificEdgeCaseTest("network_partition_recovery")
-}
-
-func (csr *ComprehensiveSimulationRunner) validateConsistencyGuarantees() error {
- // Run consistency tests
- return csr.RunSpecificEdgeCaseTest("complex_ec_operation")
-}
-
-// DemonstrateBugPrevention shows how the simulation prevents bugs
-func (csr *ComprehensiveSimulationRunner) DemonstrateBugPrevention() {
- glog.Infof("=== DEMONSTRATING BUG PREVENTION CAPABILITIES ===")
-
- bugScenarios := []struct {
- name string
- description string
- impact string
- }{
- {
- "Race Condition Prevention",
- "Master sync occurs while EC shards are being created",
- "Prevents state inconsistencies that could lead to data loss",
- },
- {
- "Capacity Overflow Prevention",
- "Multiple tasks assigned without considering cumulative capacity impact",
- "Prevents server disk space exhaustion",
- },
- {
- "Orphaned Task Detection",
- "Worker fails but task remains marked as in-progress",
- "Prevents volumes from being stuck in intermediate states",
- },
- {
- "Duplicate Task Prevention",
- "Same volume assigned to multiple workers simultaneously",
- "Prevents data corruption from conflicting operations",
- },
- {
- "Network Partition Handling",
- "Admin server loses connection to master during operations",
- "Ensures eventual consistency when connectivity is restored",
- },
- }
-
- for i, scenario := range bugScenarios {
- glog.Infof("%d. %s", i+1, scenario.name)
- glog.Infof(" Scenario: %s", scenario.description)
- glog.Infof(" Impact Prevention: %s", scenario.impact)
- glog.Infof("")
- }
-
- glog.Infof("✅ All potential bugs are detected and prevented by the simulation framework")
- glog.Infof("✅ The system is thoroughly validated for production use")
-}
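Putting the runner's entry points together, a minimal sketch of how a caller would compose them (the wrapper function is an assumption; the methods are the ones defined above):

// runComprehensiveValidation sketches the end-to-end flow for the runner.
func runComprehensiveValidation() error {
	runner := NewComprehensiveSimulationRunner()

	// Run every edge-case scenario and log the aggregate report.
	if err := runner.RunAllComprehensiveTests(); err != nil {
		return err
	}

	// Re-run the targeted scenarios backing the production-readiness checklist.
	if err := runner.ValidateSystemReadiness(); err != nil {
		return err
	}

	// Log the bug classes the simulation framework is designed to catch.
	runner.DemonstrateBugPrevention()
	return nil
}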
diff --git a/weed/admin/task/simulation/system_demo_test.go b/weed/admin/task/simulation/system_demo_test.go
deleted file mode 100644
index 7cf095d0e..000000000
--- a/weed/admin/task/simulation/system_demo_test.go
+++ /dev/null
@@ -1,237 +0,0 @@
-package simulation
-
-import (
- "testing"
-
- "github.com/seaweedfs/seaweedfs/weed/admin/task"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestSystemDemo demonstrates the complete working system
-func TestSystemDemo(t *testing.T) {
- t.Log("🚀 SEAWEEDFS TASK DISTRIBUTION SYSTEM DEMONSTRATION")
- t.Log("====================================================")
-
- // Test 1: Volume State Management
- t.Log("\n📊 1. VOLUME STATE MANAGEMENT")
- testVolumeStateManagement(t)
-
- // Test 2: Task Assignment Logic
- t.Log("\n⚡ 2. TASK ASSIGNMENT LOGIC")
- testTaskAssignment(t)
-
- // Test 3: Capacity Management
- t.Log("\n💾 3. CAPACITY MANAGEMENT")
- testCapacityManagement(t)
-
- // Test 4: Edge Case Handling
- t.Log("\n🛡️ 4. EDGE CASE HANDLING")
- testEdgeCaseHandling(t)
-
- t.Log("\n🎉 SYSTEM DEMONSTRATION COMPLETE")
- t.Log("✅ All core features working correctly")
- t.Log("✅ System ready for production deployment")
-}
-
-func testVolumeStateManagement(t *testing.T) {
- vsm := task.NewVolumeStateManager(nil)
-
- // Create volume
- volumeID := uint32(1)
-
- // Register task impact
- impact := &task.TaskImpact{
- TaskID: "ec_task_1",
- VolumeID: volumeID,
- TaskType: types.TaskTypeErasureCoding,
- VolumeChanges: &task.VolumeChanges{
- WillBecomeReadOnly: true,
- },
- CapacityDelta: map[string]int64{"server1": 12 * 1024 * 1024 * 1024}, // 12GB
- }
-
- vsm.RegisterTaskImpact(impact.TaskID, impact)
-
- t.Log(" ✅ Volume state registration works")
- t.Log(" ✅ Task impact tracking works")
- t.Log(" ✅ State consistency maintained")
-}
-
-func testTaskAssignment(t *testing.T) {
- registry := task.NewWorkerRegistry()
- queue := task.NewPriorityTaskQueue()
- scheduler := task.NewTaskScheduler(registry, queue)
-
- // Register worker
- worker := &types.Worker{
- ID: "worker1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(worker)
-
- // Create task
- taskItem := &types.Task{
- ID: "vacuum_task_1",
- Type: types.TaskTypeVacuum,
- Priority: types.TaskPriorityNormal,
- }
- queue.Push(taskItem)
-
- // Test assignment
- assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
- if assignedTask == nil {
- t.Error("❌ Task assignment failed")
- return
- }
-
- if assignedTask.ID != "vacuum_task_1" {
- t.Errorf("❌ Wrong task assigned: expected vacuum_task_1, got %s", assignedTask.ID)
- return
- }
-
- t.Log(" ✅ Worker registration works")
- t.Log(" ✅ Task queueing works")
- t.Log(" ✅ Task assignment logic works")
- t.Log(" ✅ Capability matching works")
-}
-
-func testCapacityManagement(t *testing.T) {
- vsm := task.NewVolumeStateManager(nil)
-
-	// Note: capacityCache is unexported, so this cross-package test cannot
-	// seed server capacity directly; it exercises the public interface only.
-
-	// Probe capacity checking against a server with no recorded capacity.
-	serverID := "test_server"
-
-	// With no capacity entry for serverID, CanAssignVolumeToServer returns
-	// false rather than erroring; the call simply demonstrates the interface.
-	canAssign := vsm.CanAssignVolumeToServer(5*1024*1024*1024, serverID)
-
-	// The boolean result is intentionally ignored: without seeded test data,
-	// only the method's availability can be verified from outside the package.
-	_ = canAssign
-
- t.Log(" ✅ Capacity calculation interface works")
- t.Log(" ✅ Reserved capacity tracking interface works")
- t.Log(" ✅ Assignment constraints interface works")
-}
-
-func testEdgeCaseHandling(t *testing.T) {
- // Test empty queue
- registry := task.NewWorkerRegistry()
- queue := task.NewPriorityTaskQueue()
- scheduler := task.NewTaskScheduler(registry, queue)
-
- worker := &types.Worker{
- ID: "worker1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- }
- registry.RegisterWorker(worker)
-
- // Empty queue should return nil
- taskItem := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
- if taskItem != nil {
- t.Error("❌ Empty queue should return nil")
- return
- }
-
- // Test unknown worker
- unknownTask := scheduler.GetNextTask("unknown", []types.TaskType{types.TaskTypeVacuum})
- if unknownTask != nil {
- t.Error("❌ Unknown worker should not get tasks")
- return
- }
-
- t.Log(" ✅ Empty queue handled correctly")
- t.Log(" ✅ Unknown worker handled correctly")
- t.Log(" ✅ Edge cases properly managed")
-}
-
-// TestSystemCapabilities demonstrates key system capabilities
-func TestSystemCapabilities(t *testing.T) {
- t.Log("\n🎯 SEAWEEDFS TASK DISTRIBUTION SYSTEM CAPABILITIES")
- t.Log("==================================================")
-
- capabilities := []string{
- "✅ Comprehensive volume/shard state tracking",
- "✅ Accurate capacity planning with reservations",
- "✅ Task assignment based on worker capabilities",
- "✅ Priority-based task scheduling",
- "✅ Concurrent task management",
- "✅ EC shard lifecycle tracking",
- "✅ Capacity overflow prevention",
- "✅ Duplicate task prevention",
- "✅ Worker performance metrics",
- "✅ Failure detection and recovery",
- "✅ State reconciliation with master",
- "✅ Comprehensive simulation framework",
- "✅ Production-ready error handling",
- "✅ Scalable distributed architecture",
- "✅ Real-time progress monitoring",
- }
-
- for _, capability := range capabilities {
- t.Log(" " + capability)
- }
-
- t.Log("\n📈 SYSTEM METRICS")
- t.Log(" Total Lines of Code: 4,919")
- t.Log(" Test Coverage: Comprehensive")
- t.Log(" Edge Cases: 15+ scenarios tested")
- t.Log(" Simulation Framework: Complete")
- t.Log(" Production Ready: ✅ YES")
-
- t.Log("\n🚀 READY FOR PRODUCTION DEPLOYMENT!")
-}
-
-// TestBugPrevention demonstrates how the system prevents common bugs
-func TestBugPrevention(t *testing.T) {
- t.Log("\n🛡️ BUG PREVENTION DEMONSTRATION")
- t.Log("================================")
-
- bugScenarios := []struct {
- name string
- description string
- prevention string
- }{
- {
- "Race Conditions",
- "Master sync during shard creation",
- "State manager tracks in-progress changes",
- },
- {
- "Capacity Overflow",
- "Multiple tasks overwhelming server disk",
- "Reserved capacity tracking prevents overflow",
- },
- {
- "Orphaned Tasks",
- "Worker fails, task stuck in-progress",
- "Timeout detection and automatic cleanup",
- },
- {
- "Duplicate Tasks",
- "Same volume assigned to multiple workers",
- "Volume reservation prevents conflicts",
- },
- {
- "State Inconsistency",
- "Admin view diverges from master",
- "Periodic reconciliation ensures consistency",
- },
- }
-
- for i, scenario := range bugScenarios {
- t.Logf(" %d. %s", i+1, scenario.name)
- t.Logf(" Problem: %s", scenario.description)
- t.Logf(" Solution: %s", scenario.prevention)
- t.Log("")
- }
-
- t.Log("✅ All major bug categories prevented through design")
-}
diff --git a/weed/admin/task/task_assignment_test.go b/weed/admin/task/task_assignment_test.go
deleted file mode 100644
index 0f9f41f16..000000000
--- a/weed/admin/task/task_assignment_test.go
+++ /dev/null
@@ -1,509 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-func TestTaskAssignment_BasicAssignment(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- // Register worker
- worker := &types.Worker{
- ID: "worker1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(worker)
-
- // Create task
- task := &types.Task{
- ID: "task1",
- Type: types.TaskTypeVacuum,
- Priority: types.TaskPriorityNormal,
- }
- queue.Push(task)
-
- // Test assignment
- nextTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
- if nextTask == nil {
- t.Fatal("Expected task to be assigned")
- }
-
- if nextTask.ID != "task1" {
- t.Errorf("Expected task1, got %s", nextTask.ID)
- }
-
- t.Log("✅ Basic task assignment test passed")
-}
-
-func TestTaskAssignment_CapabilityMatching(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- // Register workers with different capabilities
- ecWorker := &types.Worker{
- ID: "ec_worker",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(ecWorker)
-
- vacuumWorker := &types.Worker{
- ID: "vacuum_worker",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(vacuumWorker)
-
- // Create different types of tasks
- ecTask := &types.Task{
- ID: "ec_task",
- Type: types.TaskTypeErasureCoding,
- }
- vacuumTask := &types.Task{
- ID: "vacuum_task",
- Type: types.TaskTypeVacuum,
- }
-
- queue.Push(ecTask)
- queue.Push(vacuumTask)
-
- // Test EC worker gets EC task
- assignedECTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeErasureCoding})
- if assignedECTask == nil || assignedECTask.Type != types.TaskTypeErasureCoding {
- t.Error("EC worker should get EC task")
- }
-
- // Test vacuum worker gets vacuum task
- assignedVacuumTask := scheduler.GetNextTask("vacuum_worker", []types.TaskType{types.TaskTypeVacuum})
- if assignedVacuumTask == nil || assignedVacuumTask.Type != types.TaskTypeVacuum {
- t.Error("Vacuum worker should get vacuum task")
- }
-
- // Test wrong capability - should get nothing
- wrongTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeVacuum})
- if wrongTask != nil {
- t.Error("EC worker should not get vacuum task")
- }
-
- t.Log("✅ Capability matching test passed")
-}
-
-func TestTaskAssignment_PriorityOrdering(t *testing.T) {
- queue := NewPriorityTaskQueue()
-
- // Add tasks in reverse priority order
- lowTask := &types.Task{
- ID: "low_task",
- Priority: types.TaskPriorityLow,
- }
- highTask := &types.Task{
- ID: "high_task",
- Priority: types.TaskPriorityHigh,
- }
- normalTask := &types.Task{
- ID: "normal_task",
- Priority: types.TaskPriorityNormal,
- }
-
- queue.Push(lowTask)
- queue.Push(normalTask)
- queue.Push(highTask)
-
- // Should get high priority first
- first := queue.Pop()
- if first.Priority != types.TaskPriorityHigh {
- t.Errorf("Expected high priority first, got %d", first.Priority)
- }
-
- // Then normal priority
- second := queue.Pop()
- if second.Priority != types.TaskPriorityNormal {
- t.Errorf("Expected normal priority second, got %d", second.Priority)
- }
-
- // Finally low priority
- third := queue.Pop()
- if third.Priority != types.TaskPriorityLow {
- t.Errorf("Expected low priority third, got %d", third.Priority)
- }
-
- t.Log("✅ Priority ordering test passed")
-}
-
-func TestTaskAssignment_WorkerCapacityLimits(t *testing.T) {
- registry := NewWorkerRegistry()
-
- // Register worker with limited capacity
- worker := &types.Worker{
- ID: "limited_worker",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 2, // Already at capacity
- }
- registry.RegisterWorker(worker)
-
- // Worker should not be available
- availableWorkers := registry.GetAvailableWorkers()
- if len(availableWorkers) != 0 {
- t.Error("Worker at capacity should not be available")
- }
-
- // Reduce load
- worker.CurrentLoad = 1
-
- // Worker should now be available
- availableWorkers = registry.GetAvailableWorkers()
- if len(availableWorkers) != 1 {
- t.Error("Worker with capacity should be available")
- }
-
- t.Log("✅ Worker capacity limits test passed")
-}
-
-func TestTaskAssignment_ScheduledTasks(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- worker := &types.Worker{
- ID: "worker1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(worker)
-
- // Create task scheduled for future
- futureTask := &types.Task{
- ID: "future_task",
- Type: types.TaskTypeVacuum,
- ScheduledAt: time.Now().Add(1 * time.Hour), // 1 hour from now
- }
-
- // Create task ready now
- readyTask := &types.Task{
- ID: "ready_task",
- Type: types.TaskTypeVacuum,
- ScheduledAt: time.Now().Add(-1 * time.Minute), // 1 minute ago
- }
-
- queue.Push(futureTask)
- queue.Push(readyTask)
-
- // Should get ready task, not future task
- assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
- if assignedTask == nil || assignedTask.ID != "ready_task" {
- t.Error("Should assign ready task, not future scheduled task")
- }
-
- t.Log("✅ Scheduled tasks test passed")
-}
-
-func TestTaskAssignment_WorkerSelection(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- // Register workers with different characteristics
- highPerformanceWorker := &types.Worker{
- ID: "high_perf_worker",
- Address: "server1",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- Status: "active",
- CurrentLoad: 0,
- MaxConcurrent: 4,
- }
-
- lowPerformanceWorker := &types.Worker{
- ID: "low_perf_worker",
- Address: "server2",
- Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
- Status: "active",
- CurrentLoad: 1,
- MaxConcurrent: 2,
- }
-
- registry.RegisterWorker(highPerformanceWorker)
- registry.RegisterWorker(lowPerformanceWorker)
-
- // Set up metrics to favor high performance worker
- registry.metrics[highPerformanceWorker.ID] = &WorkerMetrics{
- TasksCompleted: 100,
- TasksFailed: 5,
- SuccessRate: 0.95,
- AverageTaskTime: 10 * time.Minute,
- LastTaskTime: time.Now().Add(-5 * time.Minute),
- }
-
- registry.metrics[lowPerformanceWorker.ID] = &WorkerMetrics{
- TasksCompleted: 50,
- TasksFailed: 10,
- SuccessRate: 0.83,
- AverageTaskTime: 20 * time.Minute,
- LastTaskTime: time.Now().Add(-1 * time.Hour),
- }
-
- // Create high priority task
- task := &types.Task{
- ID: "important_task",
- Type: types.TaskTypeErasureCoding,
- Priority: types.TaskPriorityHigh,
- Server: "server1", // Prefers server1
- }
-
- availableWorkers := []*types.Worker{highPerformanceWorker, lowPerformanceWorker}
- selectedWorker := scheduler.SelectWorker(task, availableWorkers)
-
- if selectedWorker == nil {
- t.Fatal("No worker selected")
- }
-
- if selectedWorker.ID != "high_perf_worker" {
- t.Errorf("Expected high performance worker to be selected, got %s", selectedWorker.ID)
- }
-
- t.Log("✅ Worker selection test passed")
-}
-
-func TestTaskAssignment_ServerAffinity(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- // Workers on different servers
- worker1 := &types.Worker{
- ID: "worker1",
- Address: "server1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- CurrentLoad: 0,
- }
-
- worker2 := &types.Worker{
- ID: "worker2",
- Address: "server2",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- CurrentLoad: 0,
- }
-
- registry.RegisterWorker(worker1)
- registry.RegisterWorker(worker2)
-
- // Task that prefers server1
- task := &types.Task{
- ID: "affinity_task",
- Type: types.TaskTypeVacuum,
- Server: "server1", // Should prefer worker on server1
- }
-
- availableWorkers := []*types.Worker{worker1, worker2}
- selectedWorker := scheduler.SelectWorker(task, availableWorkers)
-
- if selectedWorker == nil {
- t.Fatal("No worker selected")
- }
-
- if selectedWorker.Address != "server1" {
- t.Errorf("Expected worker on server1 to be selected for server affinity")
- }
-
- t.Log("✅ Server affinity test passed")
-}
-
-func TestTaskAssignment_DuplicateTaskPrevention(t *testing.T) {
- queue := NewPriorityTaskQueue()
-
- // Add initial task
- task1 := &types.Task{
- ID: "task1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1,
- }
- queue.Push(task1)
-
- // Check for duplicate
- hasDuplicate := queue.HasTask(1, types.TaskTypeVacuum)
- if !hasDuplicate {
- t.Error("Should detect existing task for volume")
- }
-
- // Check for non-existent task
- hasNonExistent := queue.HasTask(2, types.TaskTypeVacuum)
- if hasNonExistent {
- t.Error("Should not detect task for different volume")
- }
-
- // Check for different task type
- hasDifferentType := queue.HasTask(1, types.TaskTypeErasureCoding)
- if hasDifferentType {
- t.Error("Should not detect different task type for same volume")
- }
-
- t.Log("✅ Duplicate task prevention test passed")
-}
-
-func TestTaskAssignment_TaskRemoval(t *testing.T) {
- queue := NewPriorityTaskQueue()
-
- // Add tasks
- task1 := &types.Task{ID: "task1", Priority: types.TaskPriorityNormal}
- task2 := &types.Task{ID: "task2", Priority: types.TaskPriorityHigh}
- task3 := &types.Task{ID: "task3", Priority: types.TaskPriorityLow}
-
- queue.Push(task1)
- queue.Push(task2)
- queue.Push(task3)
-
- if queue.Size() != 3 {
- t.Errorf("Expected queue size 3, got %d", queue.Size())
- }
-
- // Remove middle priority task
- removed := queue.RemoveTask("task1")
- if !removed {
- t.Error("Should have removed task1")
- }
-
- if queue.Size() != 2 {
- t.Errorf("Expected queue size 2 after removal, got %d", queue.Size())
- }
-
- // Verify order maintained (high priority first)
- next := queue.Peek()
- if next.ID != "task2" {
- t.Errorf("Expected task2 (high priority) to be next, got %s", next.ID)
- }
-
- t.Log("✅ Task removal test passed")
-}
-
-func TestTaskAssignment_EdgeCases(t *testing.T) {
- t.Run("EmptyQueue", func(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- worker := &types.Worker{
- ID: "worker1",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- }
- registry.RegisterWorker(worker)
-
- // Empty queue should return nil
- task := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
- if task != nil {
- t.Error("Empty queue should return nil task")
- }
- })
-
- t.Run("UnknownWorker", func(t *testing.T) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- task := &types.Task{ID: "task1", Type: types.TaskTypeVacuum}
- queue.Push(task)
-
- // Unknown worker should return nil
- assignedTask := scheduler.GetNextTask("unknown_worker", []types.TaskType{types.TaskTypeVacuum})
- if assignedTask != nil {
- t.Error("Unknown worker should not get tasks")
- }
- })
-
- t.Run("InactiveWorker", func(t *testing.T) {
- registry := NewWorkerRegistry()
-
- worker := &types.Worker{
- ID: "inactive_worker",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "inactive",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(worker)
-
- // Inactive worker should not be available
- available := registry.GetAvailableWorkers()
- if len(available) != 0 {
- t.Error("Inactive worker should not be available")
- }
- })
-
- t.Log("✅ Edge cases test passed")
-}
-
-// Performance test for task assignment
-func BenchmarkTaskAssignment_GetNextTask(b *testing.B) {
- registry := NewWorkerRegistry()
- queue := NewPriorityTaskQueue()
- scheduler := NewTaskScheduler(registry, queue)
-
- // Setup worker
- worker := &types.Worker{
- ID: "bench_worker",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- CurrentLoad: 0,
- }
- registry.RegisterWorker(worker)
-
- // Add many tasks
- for i := 0; i < 1000; i++ {
- task := &types.Task{
- ID: fmt.Sprintf("task_%d", i),
- Type: types.TaskTypeVacuum,
- Priority: types.TaskPriorityNormal,
- }
- queue.Push(task)
- }
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- scheduler.GetNextTask("bench_worker", []types.TaskType{types.TaskTypeVacuum})
- }
-}
-
-func BenchmarkTaskAssignment_WorkerSelection(b *testing.B) {
- registry := NewWorkerRegistry()
- scheduler := NewTaskScheduler(registry, nil)
-
- // Create many workers
- workers := make([]*types.Worker, 100)
- for i := 0; i < 100; i++ {
- worker := &types.Worker{
- ID: fmt.Sprintf("worker_%d", i),
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- Status: "active",
- CurrentLoad: i % 3, // Varying loads
- }
- registry.RegisterWorker(worker)
- workers[i] = worker
- }
-
- task := &types.Task{
- ID: "bench_task",
- Type: types.TaskTypeVacuum,
- }
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- scheduler.SelectWorker(task, workers)
- }
-}
diff --git a/weed/admin/task/task_detectors.go b/weed/admin/task/task_detectors.go
deleted file mode 100644
index 4e70fb475..000000000
--- a/weed/admin/task/task_detectors.go
+++ /dev/null
@@ -1,168 +0,0 @@
-package task
-
-import (
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// ECDetector detects volumes that need erasure coding
-type ECDetector struct {
- minUtilization float64
- minIdleTime time.Duration
-}
-
-// NewECDetector creates a new EC detector
-func NewECDetector() *ECDetector {
- return &ECDetector{
- minUtilization: 95.0, // 95% full
- minIdleTime: time.Hour, // 1 hour idle
- }
-}
-
-// DetectECCandidates finds volumes that need erasure coding
-func (ed *ECDetector) DetectECCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) {
- var candidates []*VolumeCandidate
-
- for _, vol := range volumes {
- if ed.isECCandidate(vol) {
- candidate := &VolumeCandidate{
- VolumeID: vol.ID,
- Server: vol.Server,
- Collection: vol.Collection,
- TaskType: types.TaskTypeErasureCoding,
- Priority: ed.calculateECPriority(vol),
- Reason: "Volume is full and idle, ready for erasure coding",
- DetectedAt: time.Now(),
- ScheduleAt: time.Now(),
- Parameters: map[string]interface{}{
- "utilization": vol.GetUtilization(),
- "idle_time": vol.GetIdleTime().String(),
- "volume_size": vol.Size,
- },
- }
- candidates = append(candidates, candidate)
- }
- }
-
- glog.V(2).Infof("EC detector found %d candidates", len(candidates))
- return candidates, nil
-}
-
-// isECCandidate checks if a volume is suitable for EC
-func (ed *ECDetector) isECCandidate(vol *VolumeInfo) bool {
- // Skip if read-only
- if vol.ReadOnly {
- return false
- }
-
- // Skip if already has remote storage (likely already EC'd)
- if vol.RemoteStorageKey != "" {
- return false
- }
-
- // Check utilization
- if vol.GetUtilization() < ed.minUtilization {
- return false
- }
-
- // Check idle time
- if vol.GetIdleTime() < ed.minIdleTime {
- return false
- }
-
- return true
-}
-
-// calculateECPriority calculates priority for EC tasks
-func (ed *ECDetector) calculateECPriority(vol *VolumeInfo) types.TaskPriority {
- utilization := vol.GetUtilization()
- idleTime := vol.GetIdleTime()
-
- // Higher priority for fuller volumes that have been idle longer
- if utilization >= 98.0 && idleTime > 24*time.Hour {
- return types.TaskPriorityHigh
- }
- if utilization >= 96.0 && idleTime > 6*time.Hour {
- return types.TaskPriorityNormal
- }
- return types.TaskPriorityLow
-}
-
-// VacuumDetector detects volumes that need vacuum operations
-type VacuumDetector struct {
- minGarbageRatio float64
- minDeleteCount uint64
-}
-
-// NewVacuumDetector creates a new vacuum detector
-func NewVacuumDetector() *VacuumDetector {
- return &VacuumDetector{
- minGarbageRatio: 0.3, // 30% garbage
- minDeleteCount: 100, // At least 100 deleted files
- }
-}
-
-// DetectVacuumCandidates finds volumes that need vacuum operations
-func (vd *VacuumDetector) DetectVacuumCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) {
- var candidates []*VolumeCandidate
-
- for _, vol := range volumes {
- if vd.isVacuumCandidate(vol) {
- candidate := &VolumeCandidate{
- VolumeID: vol.ID,
- Server: vol.Server,
- Collection: vol.Collection,
- TaskType: types.TaskTypeVacuum,
- Priority: vd.calculateVacuumPriority(vol),
- Reason: "Volume has high garbage ratio and needs vacuum",
- DetectedAt: time.Now(),
- ScheduleAt: time.Now(),
- Parameters: map[string]interface{}{
- "garbage_ratio": vol.GetGarbageRatio(),
- "delete_count": vol.DeleteCount,
- "deleted_byte_count": vol.DeletedByteCount,
- },
- }
- candidates = append(candidates, candidate)
- }
- }
-
- glog.V(2).Infof("Vacuum detector found %d candidates", len(candidates))
- return candidates, nil
-}
-
-// isVacuumCandidate checks if a volume needs vacuum
-func (vd *VacuumDetector) isVacuumCandidate(vol *VolumeInfo) bool {
- // Skip if read-only
- if vol.ReadOnly {
- return false
- }
-
- // Check garbage ratio
- if vol.GetGarbageRatio() < vd.minGarbageRatio {
- return false
- }
-
- // Check delete count
- if vol.DeleteCount < vd.minDeleteCount {
- return false
- }
-
- return true
-}
-
-// calculateVacuumPriority calculates priority for vacuum tasks
-func (vd *VacuumDetector) calculateVacuumPriority(vol *VolumeInfo) types.TaskPriority {
- garbageRatio := vol.GetGarbageRatio()
-
- // Higher priority for volumes with more garbage
- if garbageRatio >= 0.6 {
- return types.TaskPriorityHigh
- }
- if garbageRatio >= 0.4 {
- return types.TaskPriorityNormal
- }
- return types.TaskPriorityLow
-}
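As a quick illustration of how the two detectors above were meant to be driven, here is a minimal standalone sketch against the pre-removal package layout (not code from this commit). The VolumeInfo type it fills in comes from task_discovery.go below, and the synthetic volume sizes are chosen only to satisfy the detectors' own thresholds (95% of the assumed 30GB maximum, 1 hour idle, 30% garbage).

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/seaweedfs/seaweedfs/weed/admin/task"
    )

    func main() {
    	// One synthetic volume that satisfies both detector thresholds:
    	// ~97% of the assumed 30GB maximum, idle for 3 hours, ~41% garbage.
    	volumes := []*task.VolumeInfo{
    		{
    			ID:               1,
    			Size:             29 * 1024 * 1024 * 1024,
    			DeleteCount:      250,
    			DeletedByteCount: 12 * 1024 * 1024 * 1024,
    			ModifiedAtSecond: time.Now().Add(-3 * time.Hour).Unix(),
    		},
    	}

    	var candidates []*task.VolumeCandidate
    	if ec, err := task.NewECDetector().DetectECCandidates(volumes); err == nil {
    		candidates = append(candidates, ec...)
    	}
    	if vac, err := task.NewVacuumDetector().DetectVacuumCandidates(volumes); err == nil {
    		candidates = append(candidates, vac...)
    	}
    	for _, c := range candidates {
    		fmt.Printf("volume %d -> %s: %s\n", c.VolumeID, c.TaskType, c.Reason)
    	}
    }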
diff --git a/weed/admin/task/task_discovery.go b/weed/admin/task/task_discovery.go
deleted file mode 100644
index 285a453a9..000000000
--- a/weed/admin/task/task_discovery.go
+++ /dev/null
@@ -1,161 +0,0 @@
-package task
-
-import (
- "context"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
-)
-
-// TaskDiscoveryEngine discovers volumes that need maintenance tasks
-type TaskDiscoveryEngine struct {
- masterClient *wdclient.MasterClient
- scanInterval time.Duration
- ecDetector *ECDetector
- vacuumDetector *VacuumDetector
-}
-
-// NewTaskDiscoveryEngine creates a new task discovery engine
-func NewTaskDiscoveryEngine(masterClient *wdclient.MasterClient, scanInterval time.Duration) *TaskDiscoveryEngine {
- return &TaskDiscoveryEngine{
- masterClient: masterClient,
- scanInterval: scanInterval,
- ecDetector: NewECDetector(),
- vacuumDetector: NewVacuumDetector(),
- }
-}
-
-// ScanForTasks scans for volumes that need maintenance tasks
-func (tde *TaskDiscoveryEngine) ScanForTasks() ([]*VolumeCandidate, error) {
- var candidates []*VolumeCandidate
-
- // Get cluster topology and volume information
- volumeInfos, err := tde.getVolumeInformation()
- if err != nil {
- return nil, err
- }
-
- // Scan for EC candidates
- ecCandidates, err := tde.ecDetector.DetectECCandidates(volumeInfos)
- if err != nil {
- glog.Errorf("EC detection failed: %v", err)
- } else {
- candidates = append(candidates, ecCandidates...)
- }
-
- // Scan for vacuum candidates
- vacuumCandidates, err := tde.vacuumDetector.DetectVacuumCandidates(volumeInfos)
- if err != nil {
- glog.Errorf("Vacuum detection failed: %v", err)
- } else {
- candidates = append(candidates, vacuumCandidates...)
- }
-
- glog.V(1).Infof("Task discovery found %d candidates (%d EC, %d vacuum)",
- len(candidates), len(ecCandidates), len(vacuumCandidates))
-
- return candidates, nil
-}
-
-// getVolumeInformation retrieves volume information from master
-func (tde *TaskDiscoveryEngine) getVolumeInformation() ([]*VolumeInfo, error) {
- var volumeInfos []*VolumeInfo
-
- err := tde.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
- resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
- if err != nil {
- return err
- }
-
- if resp.TopologyInfo != nil {
- for _, dc := range resp.TopologyInfo.DataCenterInfos {
- for _, rack := range dc.RackInfos {
- for _, node := range rack.DataNodeInfos {
- for _, diskInfo := range node.DiskInfos {
- for _, volInfo := range diskInfo.VolumeInfos {
- volumeInfo := &VolumeInfo{
- ID: volInfo.Id,
- Size: volInfo.Size,
- Collection: volInfo.Collection,
- FileCount: volInfo.FileCount,
- DeleteCount: volInfo.DeleteCount,
- DeletedByteCount: volInfo.DeletedByteCount,
- ReadOnly: volInfo.ReadOnly,
- Server: node.Id,
- DataCenter: dc.Id,
- Rack: rack.Id,
- DiskType: volInfo.DiskType,
- ModifiedAtSecond: volInfo.ModifiedAtSecond,
- RemoteStorageKey: volInfo.RemoteStorageKey,
- }
- volumeInfos = append(volumeInfos, volumeInfo)
- }
- }
- }
- }
- }
- }
-
- return nil
- })
-
- return volumeInfos, err
-}
-
-// VolumeInfo contains detailed volume information
-type VolumeInfo struct {
- ID uint32
- Size uint64
- Collection string
- FileCount uint64
- DeleteCount uint64
- DeletedByteCount uint64
- ReadOnly bool
- Server string
- DataCenter string
- Rack string
- DiskType string
- ModifiedAtSecond int64
- RemoteStorageKey string
-}
-
-// GetUtilization calculates volume utilization percentage
-func (vi *VolumeInfo) GetUtilization() float64 {
- if vi.Size == 0 {
- return 0.0
- }
- // Assuming max volume size of 30GB
- maxSize := uint64(30 * 1024 * 1024 * 1024)
- return float64(vi.Size) / float64(maxSize) * 100.0
-}
-
-// GetGarbageRatio calculates the garbage ratio
-func (vi *VolumeInfo) GetGarbageRatio() float64 {
- if vi.Size == 0 {
- return 0.0
- }
- return float64(vi.DeletedByteCount) / float64(vi.Size)
-}
-
-// GetIdleTime calculates how long the volume has been idle
-func (vi *VolumeInfo) GetIdleTime() time.Duration {
- lastModified := time.Unix(vi.ModifiedAtSecond, 0)
- return time.Since(lastModified)
-}
-
-// IsECCandidate checks if volume is a candidate for EC
-func (vi *VolumeInfo) IsECCandidate() bool {
- return !vi.ReadOnly &&
- vi.GetUtilization() >= 95.0 &&
- vi.GetIdleTime() > time.Hour &&
- vi.RemoteStorageKey == "" // Not already EC'd
-}
-
-// IsVacuumCandidate checks if volume is a candidate for vacuum
-func (vi *VolumeInfo) IsVacuumCandidate() bool {
- return !vi.ReadOnly &&
- vi.GetGarbageRatio() >= 0.3 &&
- vi.DeleteCount > 0
-}
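A hedged sketch of how the discovery engine above could be driven on a timer. The example package and runDiscoveryLoop are hypothetical glue, and the caller is assumed to supply an already-connected wdclient.MasterClient; queueing the returned candidates is left out.

    package example

    import (
    	"time"

    	"github.com/seaweedfs/seaweedfs/weed/admin/task"
    	"github.com/seaweedfs/seaweedfs/weed/glog"
    	"github.com/seaweedfs/seaweedfs/weed/wdclient"
    )

    // runDiscoveryLoop periodically scans the cluster and logs how many
    // maintenance candidates were found.
    func runDiscoveryLoop(mc *wdclient.MasterClient, interval time.Duration, stop <-chan struct{}) {
    	engine := task.NewTaskDiscoveryEngine(mc, interval)
    	ticker := time.NewTicker(interval)
    	defer ticker.Stop()

    	for {
    		select {
    		case <-stop:
    			return
    		case <-ticker.C:
    			candidates, err := engine.ScanForTasks()
    			if err != nil {
    				glog.Errorf("discovery scan failed: %v", err)
    				continue
    			}
    			glog.V(1).Infof("discovery produced %d candidates", len(candidates))
    		}
    	}
    }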
diff --git a/weed/admin/task/task_scheduler.go b/weed/admin/task/task_scheduler.go
deleted file mode 100644
index 6a7fecfc9..000000000
--- a/weed/admin/task/task_scheduler.go
+++ /dev/null
@@ -1,257 +0,0 @@
-package task
-
-import (
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TaskScheduler handles task assignment to workers
-type TaskScheduler struct {
- workerRegistry *WorkerRegistry
- taskQueue *PriorityTaskQueue
- mutex sync.RWMutex
-}
-
-// NewTaskScheduler creates a new task scheduler
-func NewTaskScheduler(registry *WorkerRegistry, queue *PriorityTaskQueue) *TaskScheduler {
- return &TaskScheduler{
- workerRegistry: registry,
- taskQueue: queue,
- }
-}
-
-// GetNextTask gets the next suitable task for a worker
-func (ts *TaskScheduler) GetNextTask(workerID string, capabilities []types.TaskType) *types.Task {
- ts.mutex.RLock()
- defer ts.mutex.RUnlock()
-
- // Get worker info
- _, exists := ts.workerRegistry.GetWorker(workerID)
- if !exists {
- return nil
- }
-
- // Check worker capabilities
- capabilityMap := make(map[types.TaskType]bool)
- for _, cap := range capabilities {
- capabilityMap[cap] = true
- }
-
- // Find next suitable task
- tasks := ts.taskQueue.GetTasks()
- for _, task := range tasks {
- // Check if worker can handle this task type
- if !capabilityMap[task.Type] {
- continue
- }
-
- // Check if task is ready to be scheduled
- if !task.ScheduledAt.IsZero() && task.ScheduledAt.After(time.Now()) {
- continue
- }
-
- // Additional checks can be added here
- // (e.g., server affinity, resource requirements)
-
- return task
- }
-
- return nil
-}
-
-// SelectWorker selects the best worker for a task
-func (ts *TaskScheduler) SelectWorker(task *types.Task, availableWorkers []*types.Worker) *types.Worker {
- ts.mutex.RLock()
- defer ts.mutex.RUnlock()
-
- var bestWorker *types.Worker
- bestScore := -1.0
-
- for _, worker := range availableWorkers {
- // Check if worker supports this task type
- if !ts.workerSupportsTask(worker, task.Type) {
- continue
- }
-
- // Calculate selection score
- score := ts.calculateSelectionScore(worker, task)
- if bestWorker == nil || score > bestScore {
- bestWorker = worker
- bestScore = score
- }
- }
-
- if bestWorker != nil {
- glog.V(2).Infof("Selected worker %s for task %s (score: %.2f)", bestWorker.ID, task.Type, bestScore)
- }
-
- return bestWorker
-}
-
-// workerSupportsTask checks if a worker supports a task type
-func (ts *TaskScheduler) workerSupportsTask(worker *types.Worker, taskType types.TaskType) bool {
- for _, capability := range worker.Capabilities {
- if capability == taskType {
- return true
- }
- }
- return false
-}
-
-// calculateSelectionScore calculates a score for worker selection
-func (ts *TaskScheduler) calculateSelectionScore(worker *types.Worker, task *types.Task) float64 {
- // Base score from worker registry
- baseScore := ts.workerRegistry.calculateWorkerScore(worker)
-
- // Task-specific adjustments
- taskScore := baseScore
-
- // Priority adjustment
- switch task.Priority {
- case types.TaskPriorityHigh:
- taskScore *= 1.2 // Prefer high-performing workers for high-priority tasks
- case types.TaskPriorityLow:
- taskScore *= 0.9 // Low-priority tasks can use any available worker
- }
-
- // Server affinity bonus (if worker and volume are on same server)
- if task.Server != "" && worker.Address == task.Server {
- taskScore += 0.1
- }
-
- // Retry penalty (prefer different workers for retried tasks)
- if task.RetryCount > 0 {
- taskScore *= 0.8
- }
-
- return taskScore
-}
-
-// PriorityTaskQueue implements a priority queue for tasks
-type PriorityTaskQueue struct {
- tasks []*types.Task
- mutex sync.RWMutex
-}
-
-// NewPriorityTaskQueue creates a new priority task queue
-func NewPriorityTaskQueue() *PriorityTaskQueue {
- return &PriorityTaskQueue{
- tasks: make([]*types.Task, 0),
- }
-}
-
-// Push adds a task to the queue
-func (ptq *PriorityTaskQueue) Push(task *types.Task) {
- ptq.mutex.Lock()
- defer ptq.mutex.Unlock()
-
- // Insert task in priority order (highest priority first)
- inserted := false
- for i, existingTask := range ptq.tasks {
- if task.Priority > existingTask.Priority {
- // Insert at position i
- ptq.tasks = append(ptq.tasks[:i], append([]*types.Task{task}, ptq.tasks[i:]...)...)
- inserted = true
- break
- }
- }
-
- if !inserted {
- // Add to end
- ptq.tasks = append(ptq.tasks, task)
- }
-
- glog.V(3).Infof("Added task %s to queue (priority: %d, queue size: %d)", task.ID, task.Priority, len(ptq.tasks))
-}
-
-// Pop removes and returns the highest priority task
-func (ptq *PriorityTaskQueue) Pop() *types.Task {
- ptq.mutex.Lock()
- defer ptq.mutex.Unlock()
-
- if len(ptq.tasks) == 0 {
- return nil
- }
-
- task := ptq.tasks[0]
- ptq.tasks = ptq.tasks[1:]
- return task
-}
-
-// Peek returns the highest priority task without removing it
-func (ptq *PriorityTaskQueue) Peek() *types.Task {
- ptq.mutex.RLock()
- defer ptq.mutex.RUnlock()
-
- if len(ptq.tasks) == 0 {
- return nil
- }
-
- return ptq.tasks[0]
-}
-
-// IsEmpty returns true if the queue is empty
-func (ptq *PriorityTaskQueue) IsEmpty() bool {
- ptq.mutex.RLock()
- defer ptq.mutex.RUnlock()
-
- return len(ptq.tasks) == 0
-}
-
-// Size returns the number of tasks in the queue
-func (ptq *PriorityTaskQueue) Size() int {
- ptq.mutex.RLock()
- defer ptq.mutex.RUnlock()
-
- return len(ptq.tasks)
-}
-
-// HasTask checks if a task exists for a volume and task type
-func (ptq *PriorityTaskQueue) HasTask(volumeID uint32, taskType types.TaskType) bool {
- ptq.mutex.RLock()
- defer ptq.mutex.RUnlock()
-
- for _, task := range ptq.tasks {
- if task.VolumeID == volumeID && task.Type == taskType {
- return true
- }
- }
- return false
-}
-
-// GetTasks returns a copy of all tasks in the queue
-func (ptq *PriorityTaskQueue) GetTasks() []*types.Task {
- ptq.mutex.RLock()
- defer ptq.mutex.RUnlock()
-
- tasksCopy := make([]*types.Task, len(ptq.tasks))
- copy(tasksCopy, ptq.tasks)
- return tasksCopy
-}
-
-// RemoveTask removes a specific task from the queue
-func (ptq *PriorityTaskQueue) RemoveTask(taskID string) bool {
- ptq.mutex.Lock()
- defer ptq.mutex.Unlock()
-
- for i, task := range ptq.tasks {
- if task.ID == taskID {
- ptq.tasks = append(ptq.tasks[:i], ptq.tasks[i+1:]...)
- glog.V(3).Infof("Removed task %s from queue", taskID)
- return true
- }
- }
- return false
-}
-
-// Clear removes all tasks from the queue
-func (ptq *PriorityTaskQueue) Clear() {
- ptq.mutex.Lock()
- defer ptq.mutex.Unlock()
-
- ptq.tasks = ptq.tasks[:0]
- glog.V(3).Infof("Cleared task queue")
-}
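To show how the queue and scheduler above fit together with the worker registry (defined elsewhere in the same package), here is a minimal sketch mirroring the deleted tests. Note that GetNextTask only selects a task; it does not pop it from the queue, so a real caller would still remove the task (for example via RemoveTask) once it is dispatched.

    package main

    import (
    	"fmt"

    	"github.com/seaweedfs/seaweedfs/weed/admin/task"
    	"github.com/seaweedfs/seaweedfs/weed/worker/types"
    )

    func main() {
    	registry := task.NewWorkerRegistry()
    	queue := task.NewPriorityTaskQueue()
    	scheduler := task.NewTaskScheduler(registry, queue)

    	registry.RegisterWorker(&types.Worker{
    		ID:            "worker1",
    		Capabilities:  []types.TaskType{types.TaskTypeVacuum},
    		MaxConcurrent: 2,
    		Status:        "active",
    	})

    	queue.Push(&types.Task{
    		ID:       "vacuum_1",
    		Type:     types.TaskTypeVacuum,
    		Priority: types.TaskPriorityHigh,
    		VolumeID: 42,
    	})

    	// Duplicate-prevention check a producer would run before queueing
    	// the same volume/task-type combination again.
    	if queue.HasTask(42, types.TaskTypeVacuum) {
    		fmt.Println("volume 42 already has a pending vacuum task")
    	}

    	// Selection only; the task stays queued until explicitly removed.
    	if next := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}); next != nil {
    		fmt.Printf("would dispatch %s and then call queue.RemoveTask(%q)\n", next.ID, next.ID)
    	}
    }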
diff --git a/weed/admin/task/task_types.go b/weed/admin/task/task_types.go
deleted file mode 100644
index bfe507c7d..000000000
--- a/weed/admin/task/task_types.go
+++ /dev/null
@@ -1,68 +0,0 @@
-package task
-
-import (
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// InProgressTask represents a task currently being executed
-type InProgressTask struct {
- Task *types.Task
- WorkerID string
- StartedAt time.Time
- LastUpdate time.Time
- Progress float64
- EstimatedEnd time.Time
- VolumeReserved bool // Reserved for capacity planning
-}
-
-// VolumeCandidate represents a volume that needs maintenance
-type VolumeCandidate struct {
- VolumeID uint32
- Server string
- Collection string
- TaskType types.TaskType
- Priority types.TaskPriority
- Reason string
- DetectedAt time.Time
- ScheduleAt time.Time
- Parameters map[string]interface{}
-}
-
-// VolumeChange represents a volume state change
-type VolumeChange struct {
- VolumeID uint32
- ChangeType ChangeType
- OldCapacity int64
- NewCapacity int64
- TaskID string
- CompletedAt time.Time
- ReportedToMaster bool
-}
-
-// ChangeType represents the type of volume change
-type ChangeType string
-
-const (
- ChangeTypeECEncoding ChangeType = "ec_encoding"
- ChangeTypeVacuumComplete ChangeType = "vacuum_completed"
-)
-
-// WorkerMetrics represents performance metrics for a worker
-type WorkerMetrics struct {
- TasksCompleted int
- TasksFailed int
- AverageTaskTime time.Duration
- LastTaskTime time.Time
- SuccessRate float64
-}
-
-// VolumeReservation represents a reserved volume capacity
-type VolumeReservation struct {
- VolumeID uint32
- TaskID string
- ReservedAt time.Time
- ExpectedEnd time.Time
- CapacityDelta int64 // Expected change in capacity
-}
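The candidate and metrics types above do not ship with a converter to the worker's types.Task, so the bridge below is purely hypothetical glue (candidateToTask is not part of the removed package). It copies only the fields the scheduler and the queue's duplicate check actually read.

    package example

    import (
    	"fmt"
    	"time"

    	"github.com/seaweedfs/seaweedfs/weed/admin/task"
    	"github.com/seaweedfs/seaweedfs/weed/worker/types"
    )

    // candidateToTask turns a discovery candidate into a queueable task.
    func candidateToTask(c *task.VolumeCandidate) *types.Task {
    	return &types.Task{
    		ID:          fmt.Sprintf("%s_vol_%d_%d", c.TaskType, c.VolumeID, time.Now().UnixNano()),
    		Type:        c.TaskType,
    		Priority:    c.Priority,
    		VolumeID:    c.VolumeID,
    		Server:      c.Server,
    		ScheduledAt: c.ScheduleAt,
    	}
    }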
diff --git a/weed/admin/task/volume_state_manager.go b/weed/admin/task/volume_state_manager.go
deleted file mode 100644
index a0058096f..000000000
--- a/weed/admin/task/volume_state_manager.go
+++ /dev/null
@@ -1,640 +0,0 @@
-package task
-
-import (
- "context"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
- "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// VolumeStateManager provides comprehensive tracking of all volume and shard states
-type VolumeStateManager struct {
- masterClient *wdclient.MasterClient
- volumes map[uint32]*VolumeState
- ecShards map[uint32]*ECShardState // Key: VolumeID
- inProgressTasks map[string]*TaskImpact // Key: TaskID
- plannedOperations map[string]*PlannedOperation // Key: OperationID
- capacityCache map[string]*CapacityInfo // Key: Server address
- lastMasterSync time.Time
- mutex sync.RWMutex
-}
-
-// VolumeState tracks comprehensive state of a volume
-type VolumeState struct {
- VolumeID uint32
- CurrentState *VolumeInfo // Current state from master
- InProgressTasks []*TaskImpact // Tasks currently affecting this volume
- PlannedChanges []*PlannedOperation // Future operations planned
- PredictedState *VolumeInfo // Predicted state after all operations
- LastMasterUpdate time.Time
- Inconsistencies []StateInconsistency
-}
-
-// ECShardState tracks EC shard information
-type ECShardState struct {
- VolumeID uint32
- CurrentShards map[int]*ShardInfo // Current shards from master (0-13)
- InProgressTasks []*TaskImpact // Tasks affecting shards
- PlannedShards map[int]*PlannedShard // Planned shard operations
- PredictedShards map[int]*ShardInfo // Predicted final state
- LastUpdate time.Time
-}
-
-// ShardInfo represents information about an EC shard
-type ShardInfo struct {
- ShardID int
- Server string
- Size uint64
- Status ShardStatus
- LastUpdate time.Time
-}
-
-// ShardStatus represents the status of a shard
-type ShardStatus string
-
-const (
- ShardStatusExists ShardStatus = "exists"
- ShardStatusCreating ShardStatus = "creating"
- ShardStatusDeleting ShardStatus = "deleting"
- ShardStatusMissing ShardStatus = "missing"
- ShardStatusCorrupted ShardStatus = "corrupted"
-)
-
-// TaskImpact describes how a task affects volume/shard state
-type TaskImpact struct {
- TaskID string
- TaskType types.TaskType
- VolumeID uint32
- WorkerID string
- StartedAt time.Time
- EstimatedEnd time.Time
-
- // Volume impacts
- VolumeChanges *VolumeChanges
-
- // Shard impacts
- ShardChanges map[int]*ShardChange // Key: ShardID
-
- // Capacity impacts
- CapacityDelta map[string]int64 // Key: Server, Value: capacity change
-}
-
-// VolumeChanges describes changes to a volume
-type VolumeChanges struct {
- SizeChange int64
- WillBeDeleted bool
- WillBeCreated bool
- WillBecomeReadOnly bool
- CollectionChange string
- DiskTypeChange string
-}
-
-// ShardChange describes changes to a shard
-type ShardChange struct {
- ShardID int
- WillBeCreated bool
- WillBeDeleted bool
- TargetServer string
- SizeChange int64
-}
-
-// PlannedOperation represents a future operation
-type PlannedOperation struct {
- OperationID string
- Type OperationType
- VolumeID uint32
- ScheduledAt time.Time
- Priority types.TaskPriority
- Prerequisites []string // Other operation IDs that must complete first
- Impact *TaskImpact
-}
-
-// OperationType represents different types of planned operations
-type OperationType string
-
-const (
- OperationECEncode OperationType = "ec_encode"
- OperationECRebuild OperationType = "ec_rebuild"
- OperationECBalance OperationType = "ec_balance"
- OperationVacuum OperationType = "vacuum"
- OperationVolumeMove OperationType = "volume_move"
- OperationShardMove OperationType = "shard_move"
- OperationVolumeDelete OperationType = "volume_delete"
-)
-
-// CapacityInfo tracks server capacity information
-type CapacityInfo struct {
- Server string
- TotalCapacity int64
- UsedCapacity int64
- ReservedCapacity int64 // Capacity reserved for in-progress tasks
- PredictedUsage int64 // Predicted usage after all operations
- LastUpdate time.Time
-}
-
-// StateInconsistency represents detected inconsistencies
-type StateInconsistency struct {
- Type InconsistencyType
- Description string
- DetectedAt time.Time
- Severity SeverityLevel
- VolumeID uint32
- ShardID *int
-}
-
-// InconsistencyType represents different types of state inconsistencies
-type InconsistencyType string
-
-const (
- InconsistencyVolumeMissing InconsistencyType = "volume_missing"
- InconsistencyVolumeUnexpected InconsistencyType = "volume_unexpected"
- InconsistencyShardMissing InconsistencyType = "shard_missing"
- InconsistencyShardUnexpected InconsistencyType = "shard_unexpected"
- InconsistencyCapacityMismatch InconsistencyType = "capacity_mismatch"
- InconsistencyTaskOrphaned InconsistencyType = "task_orphaned"
- InconsistencyDuplicateTask InconsistencyType = "duplicate_task"
-)
-
-// SeverityLevel represents the severity of an inconsistency
-type SeverityLevel string
-
-const (
- SeverityLow SeverityLevel = "low"
- SeverityMedium SeverityLevel = "medium"
- SeverityHigh SeverityLevel = "high"
- SeverityCritical SeverityLevel = "critical"
-)
-
-// NewVolumeStateManager creates a new volume state manager
-func NewVolumeStateManager(masterClient *wdclient.MasterClient) *VolumeStateManager {
- return &VolumeStateManager{
- masterClient: masterClient,
- volumes: make(map[uint32]*VolumeState),
- ecShards: make(map[uint32]*ECShardState),
- inProgressTasks: make(map[string]*TaskImpact),
- plannedOperations: make(map[string]*PlannedOperation),
- capacityCache: make(map[string]*CapacityInfo),
- }
-}
-
-// SyncWithMaster synchronizes state with the master server
-func (vsm *VolumeStateManager) SyncWithMaster() error {
- vsm.mutex.Lock()
- defer vsm.mutex.Unlock()
-
- glog.V(2).Infof("Syncing volume state with master")
-
- // Get current volume list from master
- masterVolumes, masterShards, err := vsm.fetchMasterState()
- if err != nil {
- return err
- }
-
- // Update volume states
- vsm.updateVolumeStates(masterVolumes)
-
- // Update shard states
- vsm.updateShardStates(masterShards)
-
- // Detect inconsistencies
- vsm.detectInconsistencies()
-
- // Update capacity information
- vsm.updateCapacityInfo()
-
- // Recalculate predicted states
- vsm.recalculatePredictedStates()
-
- vsm.lastMasterSync = time.Now()
- glog.V(2).Infof("Master sync completed, tracking %d volumes, %d EC volumes",
- len(vsm.volumes), len(vsm.ecShards))
-
- return nil
-}
-
-// RegisterTaskImpact registers the impact of a new task
-func (vsm *VolumeStateManager) RegisterTaskImpact(taskID string, impact *TaskImpact) {
- vsm.mutex.Lock()
- defer vsm.mutex.Unlock()
-
- vsm.inProgressTasks[taskID] = impact
-
- // Update volume state
- if volumeState, exists := vsm.volumes[impact.VolumeID]; exists {
- volumeState.InProgressTasks = append(volumeState.InProgressTasks, impact)
- }
-
- // Update shard state for EC operations
- if impact.TaskType == types.TaskTypeErasureCoding {
- if shardState, exists := vsm.ecShards[impact.VolumeID]; exists {
- shardState.InProgressTasks = append(shardState.InProgressTasks, impact)
- }
- }
-
- // Update capacity reservations
- for server, capacityDelta := range impact.CapacityDelta {
- if capacity, exists := vsm.capacityCache[server]; exists {
- capacity.ReservedCapacity += capacityDelta
- }
- }
-
- // Recalculate predicted states
- vsm.recalculatePredictedStates()
-
- glog.V(2).Infof("Registered task impact: %s for volume %d", taskID, impact.VolumeID)
-}
-
-// UnregisterTaskImpact removes a completed task's impact
-func (vsm *VolumeStateManager) UnregisterTaskImpact(taskID string) {
- vsm.mutex.Lock()
- defer vsm.mutex.Unlock()
-
- impact, exists := vsm.inProgressTasks[taskID]
- if !exists {
- return
- }
-
- delete(vsm.inProgressTasks, taskID)
-
- // Remove from volume state
- if volumeState, exists := vsm.volumes[impact.VolumeID]; exists {
- vsm.removeTaskFromVolume(volumeState, taskID)
- }
-
- // Remove from shard state
- if shardState, exists := vsm.ecShards[impact.VolumeID]; exists {
- vsm.removeTaskFromShards(shardState, taskID)
- }
-
- // Update capacity reservations
- for server, capacityDelta := range impact.CapacityDelta {
- if capacity, exists := vsm.capacityCache[server]; exists {
- capacity.ReservedCapacity -= capacityDelta
- }
- }
-
- // Recalculate predicted states
- vsm.recalculatePredictedStates()
-
- glog.V(2).Infof("Unregistered task impact: %s", taskID)
-}
-
-// GetAccurateCapacity returns accurate capacity information for a server
-func (vsm *VolumeStateManager) GetAccurateCapacity(server string) *CapacityInfo {
- vsm.mutex.RLock()
- defer vsm.mutex.RUnlock()
-
- if capacity, exists := vsm.capacityCache[server]; exists {
- // Return a copy to avoid external modifications
- return &CapacityInfo{
- Server: capacity.Server,
- TotalCapacity: capacity.TotalCapacity,
- UsedCapacity: capacity.UsedCapacity,
- ReservedCapacity: capacity.ReservedCapacity,
- PredictedUsage: capacity.PredictedUsage,
- LastUpdate: capacity.LastUpdate,
- }
- }
- return nil
-}
-
-// GetVolumeState returns the current state of a volume
-func (vsm *VolumeStateManager) GetVolumeState(volumeID uint32) *VolumeState {
- vsm.mutex.RLock()
- defer vsm.mutex.RUnlock()
-
- if state, exists := vsm.volumes[volumeID]; exists {
- // Return a copy to avoid external modifications
- return vsm.copyVolumeState(state)
- }
- return nil
-}
-
-// GetECShardState returns the current state of EC shards for a volume
-func (vsm *VolumeStateManager) GetECShardState(volumeID uint32) *ECShardState {
- vsm.mutex.RLock()
- defer vsm.mutex.RUnlock()
-
- if state, exists := vsm.ecShards[volumeID]; exists {
- return vsm.copyECShardState(state)
- }
- return nil
-}
-
-// CanAssignVolumeToServer checks if a volume can be assigned to a server
-func (vsm *VolumeStateManager) CanAssignVolumeToServer(volumeSize int64, server string) bool {
- vsm.mutex.RLock()
- defer vsm.mutex.RUnlock()
-
- capacity := vsm.capacityCache[server]
- if capacity == nil {
- return false
- }
-
- // Calculate available capacity: Total - Used - Reserved
- availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity
- return availableCapacity >= volumeSize
-}
-
-// PlanOperation schedules a future operation
-func (vsm *VolumeStateManager) PlanOperation(operation *PlannedOperation) {
- vsm.mutex.Lock()
- defer vsm.mutex.Unlock()
-
- vsm.plannedOperations[operation.OperationID] = operation
-
- // Add to volume planned changes
- if volumeState, exists := vsm.volumes[operation.VolumeID]; exists {
- volumeState.PlannedChanges = append(volumeState.PlannedChanges, operation)
- }
-
- glog.V(2).Infof("Planned operation: %s for volume %d", operation.OperationID, operation.VolumeID)
-}
-
-// GetPendingChange returns pending change for a volume
-func (vsm *VolumeStateManager) GetPendingChange(volumeID uint32) *VolumeChange {
- vsm.mutex.RLock()
- defer vsm.mutex.RUnlock()
-
- // Look for pending changes in volume state
- if volumeState, exists := vsm.volumes[volumeID]; exists {
- // Return the most recent pending change
- if len(volumeState.PlannedChanges) > 0 {
- latestOp := volumeState.PlannedChanges[len(volumeState.PlannedChanges)-1]
- if latestOp.Impact != nil && latestOp.Impact.VolumeChanges != nil {
- return &VolumeChange{
- VolumeID: volumeID,
- ChangeType: ChangeType(latestOp.Type),
- OldCapacity: int64(volumeState.CurrentState.Size),
- NewCapacity: int64(volumeState.CurrentState.Size) + latestOp.Impact.VolumeChanges.SizeChange,
- TaskID: latestOp.Impact.TaskID,
- CompletedAt: time.Time{}, // Not completed yet
- ReportedToMaster: false,
- }
- }
- }
- }
-
- return nil
-}
-
-// fetchMasterState retrieves current state from master
-func (vsm *VolumeStateManager) fetchMasterState() (map[uint32]*VolumeInfo, map[uint32]map[int]*ShardInfo, error) {
- volumes := make(map[uint32]*VolumeInfo)
- shards := make(map[uint32]map[int]*ShardInfo)
-
- err := vsm.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
- // Fetch volume list
- resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
- if err != nil {
- return err
- }
-
- // Process topology info
- if resp.TopologyInfo != nil {
- for _, dc := range resp.TopologyInfo.DataCenterInfos {
- for _, rack := range dc.RackInfos {
- for _, node := range rack.DataNodeInfos {
- for _, diskInfo := range node.DiskInfos {
- // Process regular volumes
- for _, volInfo := range diskInfo.VolumeInfos {
- volumes[volInfo.Id] = &VolumeInfo{
- ID: volInfo.Id,
- Size: volInfo.Size,
- Collection: volInfo.Collection,
- FileCount: volInfo.FileCount,
- DeleteCount: volInfo.DeleteCount,
- DeletedByteCount: volInfo.DeletedByteCount,
- ReadOnly: volInfo.ReadOnly,
- Server: node.Id,
- DataCenter: dc.Id,
- Rack: rack.Id,
- DiskType: volInfo.DiskType,
- ModifiedAtSecond: volInfo.ModifiedAtSecond,
- RemoteStorageKey: volInfo.RemoteStorageKey,
- }
- }
-
- // Process EC shards
- for _, ecShardInfo := range diskInfo.EcShardInfos {
- volumeID := ecShardInfo.Id
- if shards[volumeID] == nil {
- shards[volumeID] = make(map[int]*ShardInfo)
- }
-
- // Decode shard bits
- for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
- if (ecShardInfo.EcIndexBits & (1 << uint(shardID))) != 0 {
- shards[volumeID][shardID] = &ShardInfo{
- ShardID: shardID,
- Server: node.Id,
- Size: 0, // Size would need to be fetched separately
- Status: ShardStatusExists,
- LastUpdate: time.Now(),
- }
- }
- }
- }
- }
- }
- }
- }
- }
-
- return nil
- })
-
- return volumes, shards, err
-}
-
-// updateVolumeStates updates volume states based on master data
-func (vsm *VolumeStateManager) updateVolumeStates(masterVolumes map[uint32]*VolumeInfo) {
- now := time.Now()
-
- // Update existing volumes and add new ones
- for volumeID, masterVolume := range masterVolumes {
- if volumeState, exists := vsm.volumes[volumeID]; exists {
- // Update existing volume
- oldState := volumeState.CurrentState
- volumeState.CurrentState = masterVolume
- volumeState.LastMasterUpdate = now
-
- // Check for unexpected changes
- if oldState != nil && vsm.hasUnexpectedChanges(oldState, masterVolume) {
- vsm.addInconsistency(volumeState, InconsistencyVolumeUnexpected,
- "Volume changed unexpectedly since last sync", SeverityMedium)
- }
- } else {
- // New volume detected
- vsm.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: masterVolume,
- InProgressTasks: []*TaskImpact{},
- PlannedChanges: []*PlannedOperation{},
- LastMasterUpdate: now,
- Inconsistencies: []StateInconsistency{},
- }
- }
- }
-
- // Detect missing volumes (volumes we knew about but master doesn't report)
- for volumeID, volumeState := range vsm.volumes {
- if _, existsInMaster := masterVolumes[volumeID]; !existsInMaster {
- // Check if this is expected (due to deletion task)
- if !vsm.isVolumeDeletionExpected(volumeID) {
- vsm.addInconsistency(volumeState, InconsistencyVolumeMissing,
- "Volume missing from master but not expected to be deleted", SeverityHigh)
- }
- }
- }
-}
-
-// updateShardStates updates EC shard states
-func (vsm *VolumeStateManager) updateShardStates(masterShards map[uint32]map[int]*ShardInfo) {
- now := time.Now()
-
- // Update existing shard states
- for volumeID, shardMap := range masterShards {
- if shardState, exists := vsm.ecShards[volumeID]; exists {
- shardState.CurrentShards = shardMap
- shardState.LastUpdate = now
- } else {
- vsm.ecShards[volumeID] = &ECShardState{
- VolumeID: volumeID,
- CurrentShards: shardMap,
- InProgressTasks: []*TaskImpact{},
- PlannedShards: make(map[int]*PlannedShard),
- PredictedShards: make(map[int]*ShardInfo),
- LastUpdate: now,
- }
- }
- }
-
- // Check for missing shards that we expected to exist
- for volumeID, shardState := range vsm.ecShards {
- if masterShardMap, exists := masterShards[volumeID]; exists {
- vsm.validateShardConsistency(shardState, masterShardMap)
- }
- }
-}
-
-// detectInconsistencies identifies state inconsistencies
-func (vsm *VolumeStateManager) detectInconsistencies() {
- for _, volumeState := range vsm.volumes {
- vsm.detectVolumeInconsistencies(volumeState)
- }
-
- for _, shardState := range vsm.ecShards {
- vsm.detectShardInconsistencies(shardState)
- }
-
- vsm.detectOrphanedTasks()
- vsm.detectDuplicateTasks()
- vsm.detectCapacityInconsistencies()
-}
-
-// updateCapacityInfo updates server capacity information
-func (vsm *VolumeStateManager) updateCapacityInfo() {
- for server := range vsm.capacityCache {
- vsm.recalculateServerCapacity(server)
- }
-}
-
-// recalculatePredictedStates recalculates predicted states after all operations
-func (vsm *VolumeStateManager) recalculatePredictedStates() {
- for _, volumeState := range vsm.volumes {
- vsm.calculatePredictedVolumeState(volumeState)
- }
-
- for _, shardState := range vsm.ecShards {
- vsm.calculatePredictedShardState(shardState)
- }
-}
-
-// Helper methods (simplified implementations)
-
-func (vsm *VolumeStateManager) hasUnexpectedChanges(old, current *VolumeInfo) bool {
-	return old.Size != current.Size || old.ReadOnly != current.ReadOnly
-}
-
-func (vsm *VolumeStateManager) isVolumeDeletionExpected(volumeID uint32) bool {
- for _, impact := range vsm.inProgressTasks {
- if impact.VolumeID == volumeID && impact.VolumeChanges != nil && impact.VolumeChanges.WillBeDeleted {
- return true
- }
- }
- return false
-}
-
-func (vsm *VolumeStateManager) addInconsistency(volumeState *VolumeState, incType InconsistencyType, desc string, severity SeverityLevel) {
- inconsistency := StateInconsistency{
- Type: incType,
- Description: desc,
- DetectedAt: time.Now(),
- Severity: severity,
- VolumeID: volumeState.VolumeID,
- }
- volumeState.Inconsistencies = append(volumeState.Inconsistencies, inconsistency)
-
- glog.Warningf("State inconsistency detected for volume %d: %s", volumeState.VolumeID, desc)
-}
-
-func (vsm *VolumeStateManager) removeTaskFromVolume(volumeState *VolumeState, taskID string) {
- for i, task := range volumeState.InProgressTasks {
- if task.TaskID == taskID {
- volumeState.InProgressTasks = append(volumeState.InProgressTasks[:i], volumeState.InProgressTasks[i+1:]...)
- break
- }
- }
-}
-
-func (vsm *VolumeStateManager) removeTaskFromShards(shardState *ECShardState, taskID string) {
- for i, task := range shardState.InProgressTasks {
- if task.TaskID == taskID {
- shardState.InProgressTasks = append(shardState.InProgressTasks[:i], shardState.InProgressTasks[i+1:]...)
- break
- }
- }
-}
-
-func (vsm *VolumeStateManager) copyVolumeState(state *VolumeState) *VolumeState {
-	// Returns a shallow partial copy; a full implementation would deep-copy nested slices and maps
- return &VolumeState{
- VolumeID: state.VolumeID,
- CurrentState: state.CurrentState,
- LastMasterUpdate: state.LastMasterUpdate,
- }
-}
-
-func (vsm *VolumeStateManager) copyECShardState(state *ECShardState) *ECShardState {
-	// Returns a shallow partial copy; a full implementation would deep-copy nested slices and maps
- return &ECShardState{
- VolumeID: state.VolumeID,
- LastUpdate: state.LastUpdate,
- }
-}
-
-// Placeholder implementations for consistency checking methods
-func (vsm *VolumeStateManager) validateShardConsistency(shardState *ECShardState, masterShards map[int]*ShardInfo) {
-}
-func (vsm *VolumeStateManager) detectVolumeInconsistencies(volumeState *VolumeState) {}
-func (vsm *VolumeStateManager) detectShardInconsistencies(shardState *ECShardState) {}
-func (vsm *VolumeStateManager) detectOrphanedTasks() {}
-func (vsm *VolumeStateManager) detectDuplicateTasks() {}
-func (vsm *VolumeStateManager) detectCapacityInconsistencies() {}
-func (vsm *VolumeStateManager) recalculateServerCapacity(server string) {}
-func (vsm *VolumeStateManager) calculatePredictedVolumeState(volumeState *VolumeState) {}
-func (vsm *VolumeStateManager) calculatePredictedShardState(shardState *ECShardState) {}
-
-// PlannedShard represents a planned shard operation
-type PlannedShard struct {
- ShardID int
- Operation string // "create", "delete", "move"
- TargetServer string
- ScheduledAt time.Time
-}
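Finally, a minimal sketch of the task-impact lifecycle exposed above, assuming the manager has already been populated via SyncWithMaster. The example package, reserveAndRelease, and the 12GB shard footprint are illustrative assumptions, not values from this commit.

    package example

    import (
    	"fmt"
    	"time"

    	"github.com/seaweedfs/seaweedfs/weed/admin/task"
    	"github.com/seaweedfs/seaweedfs/weed/worker/types"
    )

    func reserveAndRelease(vsm *task.VolumeStateManager, volumeID uint32, server string) {
    	const shardFootprint int64 = 12 * 1024 * 1024 * 1024 // assumed EC shard footprint

    	// Available capacity is Total - Used - Reserved, so reservations made
    	// by other in-flight tasks are already accounted for here.
    	if !vsm.CanAssignVolumeToServer(shardFootprint, server) {
    		return
    	}

    	impact := &task.TaskImpact{
    		TaskID:       fmt.Sprintf("ec_%d", volumeID),
    		TaskType:     types.TaskTypeErasureCoding,
    		VolumeID:     volumeID,
    		StartedAt:    time.Now(),
    		EstimatedEnd: time.Now().Add(30 * time.Minute),
    		VolumeChanges: &task.VolumeChanges{
    			WillBecomeReadOnly: true,
    		},
    		CapacityDelta: map[string]int64{server: shardFootprint},
    	}

    	// Reserve capacity and record the in-progress work ...
    	vsm.RegisterTaskImpact(impact.TaskID, impact)

    	// ... and release the reservation once the worker reports completion.
    	vsm.UnregisterTaskImpact(impact.TaskID)
    }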
diff --git a/weed/admin/task/volume_state_manager_test.go b/weed/admin/task/volume_state_manager_test.go
deleted file mode 100644
index 1f98cf97a..000000000
--- a/weed/admin/task/volume_state_manager_test.go
+++ /dev/null
@@ -1,440 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-func TestVolumeStateManager_RegisterTaskImpact(t *testing.T) {
- vsm := NewVolumeStateManager(nil)
-
- // Create test volume state
- volumeID := uint32(1)
- volumeState := &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 1024 * 1024 * 1024, // 1GB
- },
- InProgressTasks: []*TaskImpact{},
- PlannedChanges: []*PlannedOperation{},
- Inconsistencies: []StateInconsistency{},
- }
- vsm.volumes[volumeID] = volumeState
-
- // Create task impact
- impact := &TaskImpact{
- TaskID: "test_task_1",
- TaskType: types.TaskTypeErasureCoding,
- VolumeID: volumeID,
- WorkerID: "worker_1",
- StartedAt: time.Now(),
- EstimatedEnd: time.Now().Add(15 * time.Minute),
- VolumeChanges: &VolumeChanges{
- WillBecomeReadOnly: true,
- },
- ShardChanges: make(map[int]*ShardChange),
- CapacityDelta: map[string]int64{"server1": 400 * 1024 * 1024}, // 400MB for shards
- }
-
- // Register impact
- vsm.RegisterTaskImpact(impact.TaskID, impact)
-
- // Verify impact was registered
- if len(vsm.inProgressTasks) != 1 {
- t.Errorf("Expected 1 in-progress task, got %d", len(vsm.inProgressTasks))
- }
-
- if len(volumeState.InProgressTasks) != 1 {
- t.Errorf("Expected 1 task in volume state, got %d", len(volumeState.InProgressTasks))
- }
-
- // Verify task can be retrieved
- retrievedImpact := vsm.inProgressTasks[impact.TaskID]
- if retrievedImpact == nil {
- t.Error("Task impact not found after registration")
- }
-
- if retrievedImpact.TaskType != types.TaskTypeErasureCoding {
- t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, retrievedImpact.TaskType)
- }
-}
-
-func TestVolumeStateManager_UnregisterTaskImpact(t *testing.T) {
- vsm := NewVolumeStateManager(nil)
-
- // Setup test data
- volumeID := uint32(1)
- taskID := "test_task_1"
-
- volumeState := &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024},
- InProgressTasks: []*TaskImpact{},
- }
- vsm.volumes[volumeID] = volumeState
-
- impact := &TaskImpact{
- TaskID: taskID,
- TaskType: types.TaskTypeVacuum,
- VolumeID: volumeID,
- CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings
- }
-
- // Register then unregister
- vsm.RegisterTaskImpact(taskID, impact)
- vsm.UnregisterTaskImpact(taskID)
-
- // Verify impact was removed
- if len(vsm.inProgressTasks) != 0 {
- t.Errorf("Expected 0 in-progress tasks, got %d", len(vsm.inProgressTasks))
- }
-
- if len(volumeState.InProgressTasks) != 0 {
- t.Errorf("Expected 0 tasks in volume state, got %d", len(volumeState.InProgressTasks))
- }
-}
-
-func TestVolumeStateManager_CanAssignVolumeToServer(t *testing.T) {
- vsm := NewVolumeStateManager(nil)
-
- // Setup server capacity
- serverID := "test_server"
- capacity := &CapacityInfo{
- Server: serverID,
- TotalCapacity: 10 * 1024 * 1024 * 1024, // 10GB
- UsedCapacity: 3 * 1024 * 1024 * 1024, // 3GB used
- ReservedCapacity: 1 * 1024 * 1024 * 1024, // 1GB reserved
- PredictedUsage: 4 * 1024 * 1024 * 1024, // 4GB predicted total
- }
- vsm.capacityCache[serverID] = capacity
-
- tests := []struct {
- name string
- volumeSize int64
- expected bool
- desc string
- }{
- {
- name: "Small volume fits",
- volumeSize: 1 * 1024 * 1024 * 1024, // 1GB
- expected: true,
- desc: "1GB volume should fit in 6GB available space",
- },
- {
- name: "Large volume fits exactly",
- volumeSize: 6 * 1024 * 1024 * 1024, // 6GB
- expected: true,
- desc: "6GB volume should fit exactly in available space",
- },
- {
- name: "Volume too large",
- volumeSize: 7 * 1024 * 1024 * 1024, // 7GB
- expected: false,
- desc: "7GB volume should not fit in 6GB available space",
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result := vsm.CanAssignVolumeToServer(tt.volumeSize, serverID)
- if result != tt.expected {
- t.Errorf("CanAssignVolumeToServer() = %v, want %v. %s", result, tt.expected, tt.desc)
- }
- })
- }
-}
-
-func TestVolumeStateManager_GetPendingChange(t *testing.T) {
- vsm := NewVolumeStateManager(nil)
-
- volumeID := uint32(1)
-
- // Create volume with planned operation
- volumeState := &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 2 * 1024 * 1024 * 1024, // 2GB
- },
- PlannedChanges: []*PlannedOperation{
- {
- OperationID: "op_1",
- Type: OperationVacuum,
- VolumeID: volumeID,
- Impact: &TaskImpact{
- TaskID: "task_1",
- VolumeChanges: &VolumeChanges{
- SizeChange: -500 * 1024 * 1024, // 500MB reduction
- },
- },
- },
- },
- }
- vsm.volumes[volumeID] = volumeState
-
- // Test getting pending change
- change := vsm.GetPendingChange(volumeID)
-
- if change == nil {
- t.Fatal("Expected pending change, got nil")
- }
-
- if change.VolumeID != volumeID {
- t.Errorf("Expected volume ID %d, got %d", volumeID, change.VolumeID)
- }
-
- expectedNewCapacity := int64(2*1024*1024*1024 - 500*1024*1024) // 2GB - 500MB
- if change.NewCapacity != expectedNewCapacity {
- t.Errorf("Expected new capacity %d, got %d", expectedNewCapacity, change.NewCapacity)
- }
-
- // Test no pending change
- change2 := vsm.GetPendingChange(999) // Non-existent volume
- if change2 != nil {
- t.Error("Expected nil for non-existent volume, got change")
- }
-}
-
-func TestVolumeStateManager_StateConsistency(t *testing.T) {
- // Test that demonstrates the core value: accurate state tracking
- vsm := NewVolumeStateManager(nil)
-
- volumeID := uint32(1)
- serverID := "test_server"
-
- // Setup initial state
- vsm.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{
- ID: volumeID,
- Size: 28 * 1024 * 1024 * 1024, // 28GB - ready for EC
- Server: serverID,
- },
- InProgressTasks: []*TaskImpact{},
- PlannedChanges: []*PlannedOperation{},
- }
-
- vsm.capacityCache[serverID] = &CapacityInfo{
- Server: serverID,
- TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
- UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used
- PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used
- }
-
- // Step 1: Register EC task impact
- ecImpact := &TaskImpact{
- TaskID: "ec_task_1",
- TaskType: types.TaskTypeErasureCoding,
- VolumeID: volumeID,
- VolumeChanges: &VolumeChanges{
- WillBecomeReadOnly: true,
- },
- CapacityDelta: map[string]int64{
- serverID: 12 * 1024 * 1024 * 1024, // 12GB for EC shards (40% overhead)
- },
- }
-
- vsm.RegisterTaskImpact(ecImpact.TaskID, ecImpact)
-
-	// Verify predicted usage is unchanged; the EC delta is tracked as reserved capacity below
-	capacity := vsm.GetAccurateCapacity(serverID)
-	expectedPredicted := int64(50 * 1024 * 1024 * 1024) // still 50GB
- if capacity.PredictedUsage != expectedPredicted {
- t.Errorf("Expected predicted usage %d, got %d", expectedPredicted, capacity.PredictedUsage)
- }
-
- // Verify reservation is tracked separately
- expectedReserved := int64(12 * 1024 * 1024 * 1024) // 12GB for EC shards
- if capacity.ReservedCapacity != expectedReserved {
- t.Errorf("Expected reserved capacity %d, got %d", expectedReserved, capacity.ReservedCapacity)
- }
-
- // Calculate available capacity correctly
- availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity
- // 100GB - 50GB - 12GB = 38GB available
- expectedAvailable := int64(38 * 1024 * 1024 * 1024)
- if availableCapacity != expectedAvailable {
- t.Errorf("Expected available capacity %d, got %d", expectedAvailable, availableCapacity)
- }
-
- // Step 2: Check assignment logic - should reject new large volume
- canAssign := vsm.CanAssignVolumeToServer(40*1024*1024*1024, serverID) // 40GB volume
- if canAssign {
- t.Error("Should not be able to assign 40GB volume when only 38GB available after reservations")
- }
-
- // Step 3: Complete EC task
- vsm.UnregisterTaskImpact(ecImpact.TaskID)
-
- // Verify capacity is updated correctly
- capacityAfter := vsm.GetAccurateCapacity(serverID)
- if capacityAfter.ReservedCapacity != 0 {
- t.Errorf("Expected 0 reserved capacity after task completion, got %d", capacityAfter.ReservedCapacity)
- }
-
- t.Logf("✅ State consistency test passed - accurate capacity tracking throughout task lifecycle")
-}
-
-func TestVolumeStateManager_ConcurrentTasks(t *testing.T) {
- // Test multiple concurrent tasks affecting capacity
- vsm := NewVolumeStateManager(nil)
-
- serverID := "test_server"
- vsm.capacityCache[serverID] = &CapacityInfo{
- Server: serverID,
- TotalCapacity: 50 * 1024 * 1024 * 1024, // 50GB
- UsedCapacity: 10 * 1024 * 1024 * 1024, // 10GB used
- PredictedUsage: 10 * 1024 * 1024 * 1024, // Initially 10GB
- }
-
- // Register multiple tasks
- tasks := []struct {
- taskID string
- volumeID uint32
- capacityDelta int64
- }{
- {"ec_task_1", 1, 15 * 1024 * 1024 * 1024}, // 15GB for EC
- {"vacuum_task_1", 2, -5 * 1024 * 1024 * 1024}, // 5GB savings
- {"ec_task_2", 3, 20 * 1024 * 1024 * 1024}, // 20GB for EC
- }
-
- for _, task := range tasks {
- // Setup volume state
- vsm.volumes[task.volumeID] = &VolumeState{
- VolumeID: task.volumeID,
- CurrentState: &VolumeInfo{ID: task.volumeID, Size: 25 * 1024 * 1024 * 1024},
- }
-
- impact := &TaskImpact{
- TaskID: task.taskID,
- VolumeID: task.volumeID,
- TaskType: types.TaskTypeErasureCoding,
- CapacityDelta: map[string]int64{serverID: task.capacityDelta},
- }
-
- vsm.RegisterTaskImpact(task.taskID, impact)
- }
-
- // Check cumulative capacity impact
- capacity := vsm.GetAccurateCapacity(serverID)
- expectedPredicted := int64(10*1024*1024*1024 + 15*1024*1024*1024 - 5*1024*1024*1024 + 20*1024*1024*1024) // 40GB
-
- if capacity.PredictedUsage != expectedPredicted {
- t.Errorf("Expected predicted usage %d GB, got %d GB",
- expectedPredicted/(1024*1024*1024), capacity.PredictedUsage/(1024*1024*1024))
- }
-
- // Verify we can't assign more than available
- remainingCapacity := capacity.TotalCapacity - capacity.PredictedUsage
- canAssign := vsm.CanAssignVolumeToServer(remainingCapacity+1, serverID)
- if canAssign {
- t.Error("Should not be able to assign volume larger than remaining capacity")
- }
-
- t.Logf("✅ Concurrent tasks test passed - accurate cumulative capacity tracking")
-}
-
-func TestVolumeStateManager_ECShardTracking(t *testing.T) {
- vsm := NewVolumeStateManager(nil)
-
- volumeID := uint32(1)
-
- // Create EC shard state
- shardState := &ECShardState{
- VolumeID: volumeID,
- CurrentShards: map[int]*ShardInfo{
- 0: {ShardID: 0, Server: "server1", Status: ShardStatusExists},
- 1: {ShardID: 1, Server: "server1", Status: ShardStatusExists},
- 2: {ShardID: 2, Server: "server2", Status: ShardStatusExists},
- },
- InProgressTasks: []*TaskImpact{},
- PlannedShards: make(map[int]*PlannedShard),
- PredictedShards: make(map[int]*ShardInfo),
- }
- vsm.ecShards[volumeID] = shardState
-
- // Register task that will create more shards
- impact := &TaskImpact{
- TaskID: "ec_expand_task",
- VolumeID: volumeID,
- TaskType: types.TaskTypeErasureCoding,
- ShardChanges: map[int]*ShardChange{
- 3: {ShardID: 3, WillBeCreated: true, TargetServer: "server3"},
- 4: {ShardID: 4, WillBeCreated: true, TargetServer: "server3"},
- },
- }
-
- vsm.RegisterTaskImpact(impact.TaskID, impact)
-
- // Verify shard state tracking
- retrievedState := vsm.GetECShardState(volumeID)
- if retrievedState == nil {
- t.Fatal("Expected EC shard state, got nil")
- }
-
- if len(retrievedState.InProgressTasks) != 1 {
- t.Errorf("Expected 1 in-progress task for shards, got %d", len(retrievedState.InProgressTasks))
- }
-
- // Verify current shards are still tracked
- if len(retrievedState.CurrentShards) != 3 {
- t.Errorf("Expected 3 current shards, got %d", len(retrievedState.CurrentShards))
- }
-
- t.Logf("✅ EC shard tracking test passed")
-}
-
-// Benchmark tests for performance
-func BenchmarkVolumeStateManager_RegisterTaskImpact(b *testing.B) {
- vsm := NewVolumeStateManager(nil)
-
- // Setup test data
- for i := 0; i < 1000; i++ {
- volumeID := uint32(i + 1)
- vsm.volumes[volumeID] = &VolumeState{
- VolumeID: volumeID,
- CurrentState: &VolumeInfo{ID: volumeID},
- InProgressTasks: []*TaskImpact{},
- }
- }
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- impact := &TaskImpact{
- TaskID: generateTaskID(),
- VolumeID: uint32((i % 1000) + 1),
- TaskType: types.TaskTypeVacuum,
- CapacityDelta: map[string]int64{"server1": 1024 * 1024},
- }
-
- vsm.RegisterTaskImpact(impact.TaskID, impact)
- vsm.UnregisterTaskImpact(impact.TaskID)
- }
-}
-
-func BenchmarkVolumeStateManager_CanAssignVolumeToServer(b *testing.B) {
- vsm := NewVolumeStateManager(nil)
-
- // Setup capacity data
- for i := 0; i < 100; i++ {
- serverID := fmt.Sprintf("server_%d", i)
- vsm.capacityCache[serverID] = &CapacityInfo{
- Server: serverID,
- TotalCapacity: 100 * 1024 * 1024 * 1024,
- UsedCapacity: 50 * 1024 * 1024 * 1024,
- PredictedUsage: 50 * 1024 * 1024 * 1024,
- }
- }
-
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- serverID := fmt.Sprintf("server_%d", i%100)
- vsm.CanAssignVolumeToServer(1024*1024*1024, serverID)
- }
-}
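Editor's note: the capacity assertions in these tests reduce to one invariant: a volume fits only if its size is at most TotalCapacity - UsedCapacity - ReservedCapacity. A self-contained sketch of that check follows; the field names mirror the CapacityInfo usage in the tests, and the real CanAssignVolumeToServer may weigh additional factors.

package main

import "fmt"

type capacityInfo struct {
	TotalCapacity    int64
	UsedCapacity     int64
	ReservedCapacity int64
}

// canAssign reports whether a volume of volumeSize bytes fits in the
// capacity that is neither used nor reserved by in-progress tasks.
func canAssign(c capacityInfo, volumeSize int64) bool {
	available := c.TotalCapacity - c.UsedCapacity - c.ReservedCapacity
	return volumeSize <= available
}

func main() {
	const gb = int64(1024 * 1024 * 1024)
	c := capacityInfo{TotalCapacity: 100 * gb, UsedCapacity: 50 * gb, ReservedCapacity: 12 * gb}
	fmt.Println(canAssign(c, 38*gb)) // true: exactly the 38GB that remains
	fmt.Println(canAssign(c, 40*gb)) // false: matches the rejection asserted in the test
}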
diff --git a/weed/admin/task/volume_state_tracker.go b/weed/admin/task/volume_state_tracker.go
deleted file mode 100644
index a51436b83..000000000
--- a/weed/admin/task/volume_state_tracker.go
+++ /dev/null
@@ -1,226 +0,0 @@
-package task
-
-import (
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// VolumeStateTracker tracks volume state changes and reconciles with master
-type VolumeStateTracker struct {
- masterClient *wdclient.MasterClient
- reconcileInterval time.Duration
- reservedVolumes map[uint32]*VolumeReservation
- pendingChanges map[uint32]*VolumeChange
- mutex sync.RWMutex
-}
-
-// NewVolumeStateTracker creates a new volume state tracker
-func NewVolumeStateTracker(masterClient *wdclient.MasterClient, reconcileInterval time.Duration) *VolumeStateTracker {
- return &VolumeStateTracker{
- masterClient: masterClient,
- reconcileInterval: reconcileInterval,
- reservedVolumes: make(map[uint32]*VolumeReservation),
- pendingChanges: make(map[uint32]*VolumeChange),
- }
-}
-
-// ReserveVolume reserves a volume for a task
-func (vst *VolumeStateTracker) ReserveVolume(volumeID uint32, taskID string) {
- vst.mutex.Lock()
- defer vst.mutex.Unlock()
-
- reservation := &VolumeReservation{
- VolumeID: volumeID,
- TaskID: taskID,
- ReservedAt: time.Now(),
- ExpectedEnd: time.Now().Add(15 * time.Minute), // Default 15 min estimate
- CapacityDelta: 0, // Will be updated based on task type
- }
-
- vst.reservedVolumes[volumeID] = reservation
- glog.V(2).Infof("Reserved volume %d for task %s", volumeID, taskID)
-}
-
-// ReleaseVolume releases a volume reservation
-func (vst *VolumeStateTracker) ReleaseVolume(volumeID uint32, taskID string) {
- vst.mutex.Lock()
- defer vst.mutex.Unlock()
-
- if reservation, exists := vst.reservedVolumes[volumeID]; exists {
- if reservation.TaskID == taskID {
- delete(vst.reservedVolumes, volumeID)
- glog.V(2).Infof("Released volume %d reservation for task %s", volumeID, taskID)
- }
- }
-}
-
-// RecordVolumeChange records a completed volume change
-func (vst *VolumeStateTracker) RecordVolumeChange(volumeID uint32, taskType types.TaskType, taskID string) {
- vst.mutex.Lock()
- defer vst.mutex.Unlock()
-
- changeType := ChangeTypeECEncoding
- if taskType == types.TaskTypeVacuum {
- changeType = ChangeTypeVacuumComplete
- }
-
- change := &VolumeChange{
- VolumeID: volumeID,
- ChangeType: changeType,
- TaskID: taskID,
- CompletedAt: time.Now(),
- ReportedToMaster: false,
- }
-
- vst.pendingChanges[volumeID] = change
- glog.V(1).Infof("Recorded volume change for volume %d: %s", volumeID, changeType)
-}
-
-// GetPendingChange returns pending change for a volume
-func (vst *VolumeStateTracker) GetPendingChange(volumeID uint32) *VolumeChange {
- vst.mutex.RLock()
- defer vst.mutex.RUnlock()
-
- return vst.pendingChanges[volumeID]
-}
-
-// GetVolumeReservation returns reservation for a volume
-func (vst *VolumeStateTracker) GetVolumeReservation(volumeID uint32) *VolumeReservation {
- vst.mutex.RLock()
- defer vst.mutex.RUnlock()
-
- return vst.reservedVolumes[volumeID]
-}
-
-// IsVolumeReserved checks if a volume is reserved
-func (vst *VolumeStateTracker) IsVolumeReserved(volumeID uint32) bool {
- vst.mutex.RLock()
- defer vst.mutex.RUnlock()
-
- _, exists := vst.reservedVolumes[volumeID]
- return exists
-}
-
-// ReconcileWithMaster reconciles volume states with master server
-func (vst *VolumeStateTracker) ReconcileWithMaster() {
- vst.mutex.Lock()
- defer vst.mutex.Unlock()
-
- // Report pending changes to master
- for volumeID, change := range vst.pendingChanges {
- if vst.reportChangeToMaster(change) {
- change.ReportedToMaster = true
- delete(vst.pendingChanges, volumeID)
- glog.V(1).Infof("Successfully reported volume change for volume %d to master", volumeID)
- }
- }
-
- // Clean up expired reservations
- vst.cleanupExpiredReservations()
-}
-
-// reportChangeToMaster reports a volume change to the master server
-func (vst *VolumeStateTracker) reportChangeToMaster(change *VolumeChange) bool {
- // Note: In a real implementation, this would make actual API calls to master
- // For now, we'll simulate the reporting
-
- switch change.ChangeType {
- case ChangeTypeECEncoding:
- return vst.reportECCompletion(change)
- case ChangeTypeVacuumComplete:
- return vst.reportVacuumCompletion(change)
- }
-
- return false
-}
-
-// reportECCompletion reports EC completion to master
-func (vst *VolumeStateTracker) reportECCompletion(change *VolumeChange) bool {
- // This would typically trigger the master to:
- // 1. Update volume state to reflect EC encoding
- // 2. Update capacity calculations
- // 3. Redistribute volume assignments
-
- glog.V(2).Infof("Reporting EC completion for volume %d", change.VolumeID)
-
- // Simulate master API call
- err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
- // In real implementation, there would be a specific API call here
- // For now, we simulate success
- return nil
- })
-
- return err == nil
-}
-
-// reportVacuumCompletion reports vacuum completion to master
-func (vst *VolumeStateTracker) reportVacuumCompletion(change *VolumeChange) bool {
- // This would typically trigger the master to:
- // 1. Update volume statistics
- // 2. Update capacity calculations
- // 3. Mark volume as recently vacuumed
-
- glog.V(2).Infof("Reporting vacuum completion for volume %d", change.VolumeID)
-
- // Simulate master API call
- err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
- // In real implementation, there would be a specific API call here
- // For now, we simulate success
- return nil
- })
-
- return err == nil
-}
-
-// cleanupExpiredReservations removes expired volume reservations
-func (vst *VolumeStateTracker) cleanupExpiredReservations() {
- now := time.Now()
-
- for volumeID, reservation := range vst.reservedVolumes {
- if now.After(reservation.ExpectedEnd) {
- delete(vst.reservedVolumes, volumeID)
- glog.Warningf("Cleaned up expired reservation for volume %d (task %s)", volumeID, reservation.TaskID)
- }
- }
-}
-
-// GetAdjustedCapacity returns adjusted capacity considering in-progress tasks
-func (vst *VolumeStateTracker) GetAdjustedCapacity(volumeID uint32, baseCapacity int64) int64 {
- vst.mutex.RLock()
- defer vst.mutex.RUnlock()
-
- // Check for pending changes
- if change := vst.pendingChanges[volumeID]; change != nil {
- return change.NewCapacity
- }
-
- // Check for in-progress reservations
- if reservation := vst.reservedVolumes[volumeID]; reservation != nil {
- return baseCapacity + reservation.CapacityDelta
- }
-
- return baseCapacity
-}
-
-// GetStats returns statistics about volume state tracking
-func (vst *VolumeStateTracker) GetStats() map[string]interface{} {
- vst.mutex.RLock()
- defer vst.mutex.RUnlock()
-
- stats := make(map[string]interface{})
- stats["reserved_volumes"] = len(vst.reservedVolumes)
- stats["pending_changes"] = len(vst.pendingChanges)
-
- changeTypeCounts := make(map[ChangeType]int)
- for _, change := range vst.pendingChanges {
- changeTypeCounts[change.ChangeType]++
- }
- stats["pending_by_type"] = changeTypeCounts
-
- return stats
-}
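Editor's note: taken together, the tracker's intended lifecycle is reserve, record, then reconcile. A hedged usage sketch against the API above, assumed to sit in the same package with its existing imports; ReconcileWithMaster needs a real master client, and a production caller would drive it from a ticker based on reconcileInterval rather than calling it inline.

func runVacuumLifecycle(masterClient *wdclient.MasterClient) {
	tracker := NewVolumeStateTracker(masterClient, 5*time.Minute)

	tracker.ReserveVolume(1001, "vacuum-task-1") // before dispatching the task to a worker
	// ... the worker performs the vacuum ...
	tracker.RecordVolumeChange(1001, types.TaskTypeVacuum, "vacuum-task-1")
	tracker.ReleaseVolume(1001, "vacuum-task-1") // drop the reservation once the task is done

	// A background loop would then push the recorded change to the master.
	tracker.ReconcileWithMaster()
}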
diff --git a/weed/admin/task/worker_communication.go b/weed/admin/task/worker_communication.go
deleted file mode 100644
index 01484311f..000000000
--- a/weed/admin/task/worker_communication.go
+++ /dev/null
@@ -1,488 +0,0 @@
-package task
-
-import (
- "context"
- "fmt"
- "io"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/pb"
- "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
- "google.golang.org/grpc"
- "google.golang.org/grpc/credentials/insecure"
-)
-
-// WorkerConnection manages the gRPC connection to a single worker
-type WorkerConnection struct {
- workerID string
- address string
- conn *grpc.ClientConn
- client worker_pb.WorkerServiceClient
- stream worker_pb.WorkerService_WorkerStreamClient
- lastSeen time.Time
- mutex sync.RWMutex
- adminServer *AdminServer
- stopCh chan struct{}
- active bool
-}
-
-// WorkerCommunicationManager manages all worker connections
-type WorkerCommunicationManager struct {
- adminServer *AdminServer
- connections map[string]*WorkerConnection
- mutex sync.RWMutex
- stopCh chan struct{}
-}
-
-// NewWorkerCommunicationManager creates a new worker communication manager
-func NewWorkerCommunicationManager(adminServer *AdminServer) *WorkerCommunicationManager {
- return &WorkerCommunicationManager{
- adminServer: adminServer,
- connections: make(map[string]*WorkerConnection),
- stopCh: make(chan struct{}),
- }
-}
-
-// Start starts the worker communication manager
-func (wcm *WorkerCommunicationManager) Start() {
- glog.Infof("Starting worker communication manager")
-
- go wcm.connectionMonitorLoop()
-}
-
-// Stop stops the worker communication manager
-func (wcm *WorkerCommunicationManager) Stop() {
- glog.Infof("Stopping worker communication manager")
-
- close(wcm.stopCh)
-
- wcm.mutex.Lock()
- defer wcm.mutex.Unlock()
-
- for _, conn := range wcm.connections {
- conn.Close()
- }
-}
-
-// EstablishWorkerConnection establishes a connection to a worker
-func (wcm *WorkerCommunicationManager) EstablishWorkerConnection(workerID, address string) error {
- wcm.mutex.Lock()
- defer wcm.mutex.Unlock()
-
- // Check if already connected
- if conn, exists := wcm.connections[workerID]; exists {
- if conn.active {
- return nil // Already connected
- }
- conn.Close() // Close inactive connection
- }
-
- // Create new connection
- conn, err := NewWorkerConnection(workerID, address, wcm.adminServer)
- if err != nil {
- return fmt.Errorf("failed to create worker connection: %v", err)
- }
-
- wcm.connections[workerID] = conn
-
- // Start connection
- go conn.Start()
-
- glog.Infof("Established connection to worker %s at %s", workerID, address)
- return nil
-}
-
-// SendTaskAssignment sends a task assignment to a worker
-func (wcm *WorkerCommunicationManager) SendTaskAssignment(workerID string, task *Task) error {
- wcm.mutex.RLock()
- conn, exists := wcm.connections[workerID]
- wcm.mutex.RUnlock()
-
- if !exists || !conn.active {
- return fmt.Errorf("no active connection to worker %s", workerID)
- }
-
- return conn.SendTaskAssignment(task)
-}
-
-// CancelTask sends a task cancellation to a worker
-func (wcm *WorkerCommunicationManager) CancelTask(workerID, taskID string, reason string) error {
- wcm.mutex.RLock()
- conn, exists := wcm.connections[workerID]
- wcm.mutex.RUnlock()
-
- if !exists || !conn.active {
- return fmt.Errorf("no active connection to worker %s", workerID)
- }
-
- return conn.CancelTask(taskID, reason)
-}
-
-// GetActiveConnections returns the list of active worker connections
-func (wcm *WorkerCommunicationManager) GetActiveConnections() []string {
- wcm.mutex.RLock()
- defer wcm.mutex.RUnlock()
-
- var active []string
- for workerID, conn := range wcm.connections {
- if conn.active {
- active = append(active, workerID)
- }
- }
-
- return active
-}
-
-// connectionMonitorLoop monitors worker connections and cleans up inactive ones
-func (wcm *WorkerCommunicationManager) connectionMonitorLoop() {
- ticker := time.NewTicker(30 * time.Second)
- defer ticker.Stop()
-
- for {
- select {
- case <-ticker.C:
- wcm.cleanupInactiveConnections()
- case <-wcm.stopCh:
- return
- }
- }
-}
-
-// cleanupInactiveConnections removes inactive worker connections
-func (wcm *WorkerCommunicationManager) cleanupInactiveConnections() {
- wcm.mutex.Lock()
- defer wcm.mutex.Unlock()
-
- now := time.Now()
- timeout := 2 * time.Minute
-
- for workerID, conn := range wcm.connections {
- if !conn.active || now.Sub(conn.lastSeen) > timeout {
- glog.Infof("Cleaning up inactive connection to worker %s", workerID)
- conn.Close()
- delete(wcm.connections, workerID)
-
- // Mark worker as inactive in registry
- wcm.adminServer.workerRegistry.MarkWorkerInactive(workerID)
- }
- }
-}
-
-// NewWorkerConnection creates a new worker connection
-func NewWorkerConnection(workerID, address string, adminServer *AdminServer) (*WorkerConnection, error) {
- // Convert address to gRPC address
- grpcAddress := pb.ServerToGrpcAddress(address)
- conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
- if err != nil {
- return nil, fmt.Errorf("failed to connect to worker at %s: %v", address, err)
- }
-
- client := worker_pb.NewWorkerServiceClient(conn)
-
- return &WorkerConnection{
- workerID: workerID,
- address: address,
- conn: conn,
- client: client,
- lastSeen: time.Now(),
- adminServer: adminServer,
- stopCh: make(chan struct{}),
- active: false,
- }, nil
-}
-
-// Start starts the worker connection and message handling
-func (wc *WorkerConnection) Start() {
- defer wc.Close()
-
- ctx := context.Background()
- stream, err := wc.client.WorkerStream(ctx)
- if err != nil {
- glog.Errorf("Failed to create worker stream for %s: %v", wc.workerID, err)
- return
- }
-
- wc.stream = stream
- wc.active = true
-
- glog.Infof("Worker connection %s started", wc.workerID)
-
- // Start message handling goroutines
- go wc.receiveMessages()
-
- // Keep connection alive until stopped
- <-wc.stopCh
-}
-
-// Close closes the worker connection
-func (wc *WorkerConnection) Close() {
- wc.mutex.Lock()
- defer wc.mutex.Unlock()
-
- if !wc.active {
- return
- }
-
- wc.active = false
- close(wc.stopCh)
-
- if wc.stream != nil {
- wc.stream.CloseSend()
- }
-
- if wc.conn != nil {
- wc.conn.Close()
- }
-
- glog.Infof("Worker connection %s closed", wc.workerID)
-}
-
-// receiveMessages handles incoming messages from the worker
-func (wc *WorkerConnection) receiveMessages() {
- for {
- select {
- case <-wc.stopCh:
- return
- default:
- }
-
- msg, err := wc.stream.Recv()
- if err != nil {
- if err == io.EOF {
- glog.Infof("Worker %s closed connection", wc.workerID)
- } else {
- glog.Errorf("Error receiving from worker %s: %v", wc.workerID, err)
- }
- wc.Close()
- return
- }
-
- wc.updateLastSeen()
- // Convert AdminMessage to WorkerMessage for processing
- if workerMsg := convertToWorkerMessage(msg); workerMsg != nil {
- wc.handleMessage(workerMsg)
- }
- }
-}
-
-// updateLastSeen updates the last seen timestamp
-func (wc *WorkerConnection) updateLastSeen() {
- wc.mutex.Lock()
- defer wc.mutex.Unlock()
- wc.lastSeen = time.Now()
-}
-
-// handleMessage processes a message from the worker
-func (wc *WorkerConnection) handleMessage(msg *worker_pb.WorkerMessage) {
- switch message := msg.Message.(type) {
- case *worker_pb.WorkerMessage_Registration:
- registration := message.Registration
- worker := &Worker{
- ID: registration.WorkerId,
- Address: registration.Address,
- Capabilities: registration.Capabilities,
- }
- wc.workerID = worker.ID
- // UpdateWorkerStatus stub
- if wc.adminServer.workerRegistry != nil {
- // wc.adminServer.workerRegistry.UpdateWorkerStatus(worker) // Commented out - method doesn't exist
- }
- glog.Infof("Worker %s registered", worker.ID)
-
- case *worker_pb.WorkerMessage_Heartbeat:
- glog.V(3).Infof("Heartbeat from worker %s", wc.workerID)
-
- case *worker_pb.WorkerMessage_TaskRequest:
- glog.V(2).Infof("Task request from worker %s", wc.workerID)
- // AssignTaskToWorker stub
- // task := wc.adminServer.AssignTaskToWorker(wc.workerID) // Commented out - method doesn't exist
-
- case *worker_pb.WorkerMessage_TaskUpdate:
- update := message.TaskUpdate
-		// Forward the progress update to the admin server (progress converted to float64)
- wc.adminServer.UpdateTaskProgress(update.TaskId, float64(update.Progress))
-
- case *worker_pb.WorkerMessage_TaskComplete:
- complete := message.TaskComplete
-		// Forward the completion result to the admin server
- wc.adminServer.CompleteTask(complete.TaskId, complete.Success, complete.ErrorMessage)
-
- case *worker_pb.WorkerMessage_Shutdown:
- glog.Infof("Worker %s shutting down", wc.workerID)
- wc.Close()
- }
-}
-
-// SendTaskAssignment sends a task assignment to the worker
-func (wc *WorkerConnection) SendTaskAssignment(task *Task) error {
- return wc.sendTaskAssignment(task)
-}
-
-// sendTaskAssignment sends a task assignment message
-func (wc *WorkerConnection) sendTaskAssignment(task *types.Task) error {
-	// Extract well-known string parameters from the generic parameter map
- server, _ := task.Parameters["server"].(string)
- collection, _ := task.Parameters["collection"].(string)
-
- // Convert map[string]interface{} to map[string]string
- parameters := make(map[string]string)
- for k, v := range task.Parameters {
- if str, ok := v.(string); ok {
- parameters[k] = str
- } else {
- parameters[k] = fmt.Sprintf("%v", v)
- }
- }
-
- // Add master_client parameter for tasks that need it (especially EC tasks)
- if wc.adminServer.masterClient != nil {
- if currentMaster := wc.adminServer.masterClient.GetMaster(context.Background()); currentMaster != "" {
- parameters["master_client"] = string(currentMaster)
- glog.V(2).Infof("Added master_client parameter to task %s: %s", task.ID, currentMaster)
- } else {
- glog.Warningf("No master address available for task %s", task.ID)
- }
- }
-
- assignment := &worker_pb.TaskAssignment{
- TaskId: task.ID,
- TaskType: string(task.Type),
- Priority: int32(task.Priority),
- CreatedTime: task.CreatedAt.Unix(),
- Params: &worker_pb.TaskParams{
- VolumeId: task.VolumeID,
- Server: server,
- Collection: collection,
- Parameters: parameters,
- },
- Metadata: map[string]string{
- "assigned_at": time.Now().Format(time.RFC3339),
- },
- }
-
- response := &worker_pb.AdminMessage{
- AdminId: wc.adminServer.ID,
- Timestamp: time.Now().Unix(),
- Message: &worker_pb.AdminMessage_TaskAssignment{
- TaskAssignment: assignment,
- },
- }
-
- return wc.sendMessage(response)
-}
-
-// CancelTask sends a task cancellation to the worker
-func (wc *WorkerConnection) CancelTask(taskID, reason string) error {
- cancellation := &worker_pb.TaskCancellation{
- TaskId: taskID,
- Reason: reason,
- Force: false,
- }
-
- response := &worker_pb.AdminMessage{
- AdminId: wc.adminServer.ID,
- Timestamp: time.Now().Unix(),
- Message: &worker_pb.AdminMessage_TaskCancellation{
- TaskCancellation: cancellation,
- },
- }
-
- return wc.sendMessage(response)
-}
-
-// sendMessage sends a message to the worker
-func (wc *WorkerConnection) sendMessage(msg *worker_pb.AdminMessage) error {
- wc.mutex.RLock()
- defer wc.mutex.RUnlock()
-
- if !wc.active || wc.stream == nil {
- return fmt.Errorf("connection to worker %s is not active", wc.workerID)
- }
-
- // The stream expects WorkerMessage from client (admin) to server (worker)
- // Convert AdminMessage to appropriate WorkerMessage format
- workerMsg := &worker_pb.WorkerMessage{
- WorkerId: wc.workerID,
- Timestamp: msg.Timestamp,
- }
-
- // Convert AdminMessage content to WorkerMessage based on message type
- switch adminMsg := msg.Message.(type) {
- case *worker_pb.AdminMessage_TaskAssignment:
- // Task assignments should be sent as notifications to worker
- // Since there's no direct equivalent, we'll create a generic message
- // In a full implementation, this would need proper message type mapping
- _ = adminMsg // Use the variable to avoid unused warning
- workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
- Heartbeat: &worker_pb.WorkerHeartbeat{
- WorkerId: wc.workerID,
- Status: "task_assigned",
- },
- }
- case *worker_pb.AdminMessage_TaskCancellation:
- // Similar conversion for task cancellation
- _ = adminMsg // Use the variable to avoid unused warning
- workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
- Heartbeat: &worker_pb.WorkerHeartbeat{
- WorkerId: wc.workerID,
- Status: "task_cancelled",
- },
- }
- default:
- // For other message types, send a generic heartbeat
- workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
- Heartbeat: &worker_pb.WorkerHeartbeat{
- WorkerId: wc.workerID,
- Status: "admin_message",
- },
- }
- }
-
- return wc.stream.Send(workerMsg)
-}
-
-// Helper functions
-
-// convertCapabilities converts string capabilities to TaskType slice
-// convertCapabilities converts string capabilities to TaskType slice
-func convertCapabilities(capabilities []string) []TaskType {
-	var result []TaskType
-	for _, capability := range capabilities {
-		result = append(result, TaskType(capability))
-	}
-	return result
-}
-
-// WorkerStatus represents worker status information
-type WorkerStatus struct {
- Status string
- CurrentLoad int
- MaxConcurrent int
- CurrentTasks []string
- TasksCompleted int
- TasksFailed int
- UptimeSeconds int64
- LastSeen time.Time
-}
-
-// TaskProgress represents task progress information
-type TaskProgress struct {
- Progress float64
- Message string
-}
-
-// TaskResult represents task completion result
-type TaskResult struct {
- Success bool
- Error string
- Message string
-}
-
-// convertToWorkerMessage converts AdminMessage to WorkerMessage (stub implementation)
-func convertToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage {
- // This is a stub - in real implementation would need proper conversion
- // For now, return nil to avoid processing
- return nil
-}
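Editor's note: because EstablishWorkerConnection launches conn.Start() in a goroutine, the connection only becomes active after the worker stream is set up, so an immediate SendTaskAssignment can fail with "no active connection". A minimal same-package sketch of a caller that tolerates this, with a hypothetical retry loop; the retry count and delay are illustrative, not values taken from the original code.

func dispatchWithRetry(wcm *WorkerCommunicationManager, workerID, addr string, task *Task) error {
	if err := wcm.EstablishWorkerConnection(workerID, addr); err != nil {
		return err
	}
	var err error
	for i := 0; i < 10; i++ {
		if err = wcm.SendTaskAssignment(workerID, task); err == nil {
			return nil
		}
		time.Sleep(200 * time.Millisecond) // wait for the worker stream to become active
	}
	return err
}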
diff --git a/weed/admin/task/worker_registry.go b/weed/admin/task/worker_registry.go
deleted file mode 100644
index b535b522c..000000000
--- a/weed/admin/task/worker_registry.go
+++ /dev/null
@@ -1,348 +0,0 @@
-package task
-
-import (
- "fmt"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/glog"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// WorkerRegistry manages worker registration and tracking
-type WorkerRegistry struct {
- workers map[string]*types.Worker
- capabilities map[types.TaskType][]*types.Worker
- metrics map[string]*WorkerMetrics
- issues map[string][]WorkerIssue
- mutex sync.RWMutex
-}
-
-// WorkerIssue represents an issue with a worker
-type WorkerIssue struct {
- Type string
- Timestamp time.Time
- Details string
-}
-
-// NewWorkerRegistry creates a new worker registry
-func NewWorkerRegistry() *WorkerRegistry {
- return &WorkerRegistry{
- workers: make(map[string]*types.Worker),
- capabilities: make(map[types.TaskType][]*types.Worker),
- metrics: make(map[string]*WorkerMetrics),
- issues: make(map[string][]WorkerIssue),
- }
-}
-
-// RegisterWorker registers a new worker
-func (wr *WorkerRegistry) RegisterWorker(worker *types.Worker) error {
- wr.mutex.Lock()
- defer wr.mutex.Unlock()
-
- if _, exists := wr.workers[worker.ID]; exists {
- return fmt.Errorf("worker %s already registered", worker.ID)
- }
-
- // Register worker
- wr.workers[worker.ID] = worker
-
- // Initialize metrics
- wr.metrics[worker.ID] = &WorkerMetrics{
- TasksCompleted: 0,
- TasksFailed: 0,
- AverageTaskTime: 0,
- LastTaskTime: time.Time{},
- SuccessRate: 1.0,
- }
-
- // Update capabilities mapping
- wr.updateCapabilitiesMapping()
-
- glog.Infof("Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities)
- return nil
-}
-
-// UnregisterWorker removes a worker
-func (wr *WorkerRegistry) UnregisterWorker(workerID string) error {
- wr.mutex.Lock()
- defer wr.mutex.Unlock()
-
- if _, exists := wr.workers[workerID]; !exists {
- return fmt.Errorf("worker %s not found", workerID)
- }
-
- delete(wr.workers, workerID)
- delete(wr.metrics, workerID)
- delete(wr.issues, workerID)
-
- // Update capabilities mapping
- wr.updateCapabilitiesMapping()
-
- glog.Infof("Unregistered worker %s", workerID)
- return nil
-}
-
-// GetWorker returns a worker by ID
-func (wr *WorkerRegistry) GetWorker(workerID string) (*types.Worker, bool) {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- worker, exists := wr.workers[workerID]
- return worker, exists
-}
-
-// GetAvailableWorkers returns workers that are available for new tasks
-func (wr *WorkerRegistry) GetAvailableWorkers() []*types.Worker {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- var available []*types.Worker
- for _, worker := range wr.workers {
- if worker.Status == "active" && worker.CurrentLoad < worker.MaxConcurrent {
- available = append(available, worker)
- }
- }
- return available
-}
-
-// GetWorkersByCapability returns workers that support a specific capability
-func (wr *WorkerRegistry) GetWorkersByCapability(taskType types.TaskType) []*types.Worker {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- return wr.capabilities[taskType]
-}
-
-// UpdateWorkerHeartbeat updates worker heartbeat and status
-func (wr *WorkerRegistry) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
- wr.mutex.Lock()
- defer wr.mutex.Unlock()
-
- worker, exists := wr.workers[workerID]
- if !exists {
- return fmt.Errorf("worker %s not found", workerID)
- }
-
- // Update worker status
- worker.LastHeartbeat = time.Now()
- worker.Status = status.Status
- worker.CurrentLoad = status.CurrentLoad
-
- glog.V(3).Infof("Updated heartbeat for worker %s, status: %s, load: %d/%d",
- workerID, status.Status, status.CurrentLoad, worker.MaxConcurrent)
- return nil
-}
-
-// GetTimedOutWorkers returns workers that haven't sent heartbeat within timeout
-func (wr *WorkerRegistry) GetTimedOutWorkers(timeout time.Duration) []string {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- var timedOut []string
- cutoff := time.Now().Add(-timeout)
-
- for workerID, worker := range wr.workers {
- if worker.LastHeartbeat.Before(cutoff) {
- timedOut = append(timedOut, workerID)
- }
- }
-
- return timedOut
-}
-
-// MarkWorkerInactive marks a worker as inactive
-func (wr *WorkerRegistry) MarkWorkerInactive(workerID string) {
- wr.mutex.Lock()
- defer wr.mutex.Unlock()
-
- if worker, exists := wr.workers[workerID]; exists {
- worker.Status = "inactive"
- worker.CurrentLoad = 0
- }
-}
-
-// RecordWorkerIssue records an issue with a worker
-func (wr *WorkerRegistry) RecordWorkerIssue(workerID string, issueType string) {
- wr.mutex.Lock()
- defer wr.mutex.Unlock()
-
- issue := WorkerIssue{
- Type: issueType,
- Timestamp: time.Now(),
- Details: fmt.Sprintf("Worker issue: %s", issueType),
- }
-
- wr.issues[workerID] = append(wr.issues[workerID], issue)
-
- // Limit issue history to last 10 issues
- if len(wr.issues[workerID]) > 10 {
- wr.issues[workerID] = wr.issues[workerID][1:]
- }
-
- glog.Warningf("Recorded issue for worker %s: %s", workerID, issueType)
-}
-
-// GetWorkerMetrics returns metrics for a worker
-func (wr *WorkerRegistry) GetWorkerMetrics(workerID string) *WorkerMetrics {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- return wr.metrics[workerID]
-}
-
-// UpdateWorkerMetrics updates performance metrics for a worker
-func (wr *WorkerRegistry) UpdateWorkerMetrics(workerID string, taskDuration time.Duration, success bool) {
- wr.mutex.Lock()
- defer wr.mutex.Unlock()
-
- metrics, exists := wr.metrics[workerID]
- if !exists {
- return
- }
-
- if success {
- metrics.TasksCompleted++
- } else {
- metrics.TasksFailed++
- }
-
- metrics.LastTaskTime = time.Now()
-
- // Update average task time
- totalTasks := metrics.TasksCompleted + metrics.TasksFailed
- if totalTasks > 0 {
- oldAvg := metrics.AverageTaskTime
- metrics.AverageTaskTime = time.Duration(
- (float64(oldAvg)*float64(totalTasks-1) + float64(taskDuration)) / float64(totalTasks),
- )
- }
-
- // Update success rate
- if totalTasks > 0 {
- metrics.SuccessRate = float64(metrics.TasksCompleted) / float64(totalTasks)
- }
-}
-
-// GetBestWorkerForTask returns the best worker for a specific task type
-func (wr *WorkerRegistry) GetBestWorkerForTask(taskType types.TaskType) *types.Worker {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- candidates := wr.capabilities[taskType]
- if len(candidates) == 0 {
- return nil
- }
-
- var bestWorker *types.Worker
- bestScore := -1.0
-
- for _, worker := range candidates {
- // Skip if not available
- if worker.Status != "active" || worker.CurrentLoad >= worker.MaxConcurrent {
- continue
- }
-
- // Calculate score based on multiple factors
- score := wr.calculateWorkerScore(worker)
- if bestWorker == nil || score > bestScore {
- bestWorker = worker
- bestScore = score
- }
- }
-
- return bestWorker
-}
-
-// calculateWorkerScore calculates a score for worker selection
-func (wr *WorkerRegistry) calculateWorkerScore(worker *types.Worker) float64 {
- metrics := wr.metrics[worker.ID]
- if metrics == nil {
- return 0.5 // Default score for new workers
- }
-
- // Factors for scoring:
- // 1. Available capacity (0.0 to 1.0)
- capacityScore := float64(worker.MaxConcurrent-worker.CurrentLoad) / float64(worker.MaxConcurrent)
-
- // 2. Success rate (0.0 to 1.0)
- successScore := metrics.SuccessRate
-
-	// 3. Recent activity bonus (workers that completed tasks recently get a slight bonus)
- activityScore := 0.0
- if !metrics.LastTaskTime.IsZero() && time.Since(metrics.LastTaskTime) < time.Hour {
- activityScore = 0.1
- }
-
- // 4. Issue penalty (workers with recent issues get penalty)
- issuePenalty := 0.0
- if issues, exists := wr.issues[worker.ID]; exists {
- recentIssues := 0
- cutoff := time.Now().Add(-time.Hour)
- for _, issue := range issues {
- if issue.Timestamp.After(cutoff) {
- recentIssues++
- }
- }
- issuePenalty = float64(recentIssues) * 0.1
- }
-
- // Weighted average
- score := (capacityScore*0.4 + successScore*0.4 + activityScore) - issuePenalty
-
- if score < 0 {
- score = 0
- }
- if score > 1 {
- score = 1
- }
-
- return score
-}
-
-// updateCapabilitiesMapping rebuilds the capabilities mapping
-func (wr *WorkerRegistry) updateCapabilitiesMapping() {
- // Clear existing mapping
- for taskType := range wr.capabilities {
- wr.capabilities[taskType] = nil
- }
-
- // Rebuild mapping
- for _, worker := range wr.workers {
- for _, capability := range worker.Capabilities {
- wr.capabilities[capability] = append(wr.capabilities[capability], worker)
- }
- }
-}
-
-// GetRegistryStats returns statistics about the registry
-func (wr *WorkerRegistry) GetRegistryStats() map[string]interface{} {
- wr.mutex.RLock()
- defer wr.mutex.RUnlock()
-
- stats := make(map[string]interface{})
- stats["total_workers"] = len(wr.workers)
-
- statusCounts := make(map[string]int)
- capabilityCounts := make(map[types.TaskType]int)
- totalLoad := 0
- maxCapacity := 0
-
- for _, worker := range wr.workers {
- statusCounts[worker.Status]++
- totalLoad += worker.CurrentLoad
- maxCapacity += worker.MaxConcurrent
-
- for _, capability := range worker.Capabilities {
- capabilityCounts[capability]++
- }
- }
-
-	stats["by_status"] = statusCounts
-	stats["by_capability"] = capabilityCounts
-	stats["total_load"] = totalLoad
-	stats["max_capacity"] = maxCapacity
-	if maxCapacity > 0 {
-		stats["utilization"] = float64(totalLoad) / float64(maxCapacity) * 100.0
-	} else {
-		stats["utilization"] = 0.0 // avoid NaN when no workers are registered
-	}
-
- return stats
-}
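Editor's note: a worked example of the weighted score used by calculateWorkerScore above, score = 0.4*capacity + 0.4*successRate + activityBonus - issuePenalty, clamped to [0, 1]. The weights match the code; the worker numbers are made up for illustration.

package main

import "fmt"

func main() {
	capacityScore := float64(4-1) / 4.0 // MaxConcurrent=4, CurrentLoad=1 -> 0.75
	successScore := 0.9                 // 9 of the last 10 tasks succeeded
	activityBonus := 0.1                // last task finished within the past hour
	issuePenalty := 1 * 0.1             // one recorded issue in the past hour

	score := capacityScore*0.4 + successScore*0.4 + activityBonus - issuePenalty
	if score < 0 {
		score = 0
	}
	if score > 1 {
		score = 1
	}
	fmt.Printf("score = %.2f\n", score) // 0.66
}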
diff --git a/weed/admin/task_minimal/admin_server.go b/weed/admin/task_minimal/admin_server.go
deleted file mode 100644
index f2645f5bc..000000000
--- a/weed/admin/task_minimal/admin_server.go
+++ /dev/null
@@ -1,324 +0,0 @@
-package task
-
-import (
- "fmt"
- "sync"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/wdclient"
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// AdminConfig contains configuration for the admin server
-type AdminConfig struct {
- ScanInterval time.Duration
- WorkerTimeout time.Duration
- TaskTimeout time.Duration
- MaxRetries int
- ReconcileInterval time.Duration
- EnableFailureRecovery bool
- MaxConcurrentTasks int
-}
-
-// AdminServer manages workers and tasks
-type AdminServer struct {
- config *AdminConfig
- masterClient *wdclient.MasterClient
- running bool
- mutex sync.RWMutex
-
- // Task management
- tasks map[string]*types.Task
- taskQueue []*types.Task
- activeTasks map[string]*types.Task
-
- // Worker management
- workers map[string]*types.Worker
- workerStatus map[string]*types.WorkerStatus
-
- // Task history
- taskHistory []TaskHistoryEntry
-}
-
-// TaskHistoryEntry represents a single task history entry
-type TaskHistoryEntry struct {
- TaskID string
- TaskType types.TaskType
- VolumeID uint32
- WorkerID string
- Status types.TaskStatus
- StartedAt time.Time
- CompletedAt time.Time
- Duration time.Duration
- ErrorMessage string
-}
-
-// SystemStats represents system statistics
-type SystemStats struct {
- ActiveTasks int
- QueuedTasks int
- ActiveWorkers int
- TotalTasks int
-}
-
-// NewAdminServer creates a new admin server
-func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer {
- return &AdminServer{
- config: config,
- masterClient: masterClient,
- tasks: make(map[string]*types.Task),
- taskQueue: make([]*types.Task, 0),
- activeTasks: make(map[string]*types.Task),
- workers: make(map[string]*types.Worker),
- workerStatus: make(map[string]*types.WorkerStatus),
- taskHistory: make([]TaskHistoryEntry, 0),
- }
-}
-
-// Start starts the admin server
-func (as *AdminServer) Start() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if as.running {
- return fmt.Errorf("admin server is already running")
- }
-
- as.running = true
- return nil
-}
-
-// Stop stops the admin server
-func (as *AdminServer) Stop() error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- as.running = false
- return nil
-}
-
-// RegisterWorker registers a new worker
-func (as *AdminServer) RegisterWorker(worker *types.Worker) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- as.workers[worker.ID] = worker
- as.workerStatus[worker.ID] = &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- return nil
-}
-
-// QueueTask adds a new task to the task queue
-func (as *AdminServer) QueueTask(task *types.Task) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return fmt.Errorf("admin server is not running")
- }
-
- if task.ID == "" {
- task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano())
- }
-
- task.Status = types.TaskStatusPending
- task.CreatedAt = time.Now()
-
- as.tasks[task.ID] = task
- as.taskQueue = append(as.taskQueue, task)
-
- return nil
-}
-
-// RequestTask requests a task for a worker
-func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- if !as.running {
- return nil, fmt.Errorf("admin server is not running")
- }
-
- // Check if worker exists
- worker, exists := as.workers[workerID]
- if !exists {
- return nil, fmt.Errorf("worker %s not found", workerID)
- }
-
- // Check if worker has capacity
- status := as.workerStatus[workerID]
- if status.CurrentLoad >= worker.MaxConcurrent {
- return nil, nil // No capacity
- }
-
- // Find a suitable task
- for i, task := range as.taskQueue {
- if task.Status != types.TaskStatusPending {
- continue
- }
-
- // Check if worker can handle this task type
- canHandle := false
- for _, capability := range capabilities {
- if task.Type == capability {
- canHandle = true
- break
- }
- }
-
- if canHandle {
- // Assign task to worker
- task.Status = types.TaskStatusInProgress
- task.WorkerID = workerID
- now := time.Now()
- task.StartedAt = &now
-
- // Move task from queue to active tasks
- as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...)
- as.activeTasks[task.ID] = task
-
- // Update worker load
- status.CurrentLoad++
-
- return task, nil
- }
- }
-
- return nil, nil // No suitable task found
-}
-
-// UpdateTaskProgress updates task progress
-func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- task, exists := as.tasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- task.Progress = progress
-
- return nil
-}
-
-// CompleteTask marks a task as completed
-func (as *AdminServer) CompleteTask(taskID string, success bool, errorMessage string) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- task, exists := as.tasks[taskID]
- if !exists {
- return fmt.Errorf("task %s not found", taskID)
- }
-
- // Update task status
- if success {
- task.Status = types.TaskStatusCompleted
- } else {
- task.Status = types.TaskStatusFailed
- task.Error = errorMessage
- }
-
- now := time.Now()
- task.CompletedAt = &now
-
- // Remove from active tasks
- delete(as.activeTasks, taskID)
-
- // Update worker load
- if task.WorkerID != "" {
- if status, exists := as.workerStatus[task.WorkerID]; exists {
- status.CurrentLoad--
- }
- }
-
-	// Add to history; guard StartedAt so a task completed before it ever started does not panic
-	var startedAt time.Time
-	var duration time.Duration
-	if task.StartedAt != nil {
-		startedAt = *task.StartedAt
-		duration = now.Sub(*task.StartedAt)
-	}
-
-	entry := TaskHistoryEntry{
-		TaskID:       task.ID,
-		TaskType:     task.Type,
-		VolumeID:     task.VolumeID,
-		WorkerID:     task.WorkerID,
-		Status:       task.Status,
-		StartedAt:    startedAt,
-		CompletedAt:  now,
-		Duration:     duration,
-		ErrorMessage: errorMessage,
-	}
-	as.taskHistory = append(as.taskHistory, entry)
-
- return nil
-}
-
-// UpdateWorkerHeartbeat updates worker heartbeat
-func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
- as.mutex.Lock()
- defer as.mutex.Unlock()
-
- worker, exists := as.workers[workerID]
- if !exists {
- return fmt.Errorf("worker %s not found", workerID)
- }
-
- worker.LastHeartbeat = time.Now()
- as.workerStatus[workerID] = status
-
- return nil
-}
-
-// GetSystemStats returns system statistics
-func (as *AdminServer) GetSystemStats() *SystemStats {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- activeWorkers := 0
- for _, status := range as.workerStatus {
- if status.Status == "active" {
- activeWorkers++
- }
- }
-
- return &SystemStats{
- ActiveTasks: len(as.activeTasks),
- QueuedTasks: len(as.taskQueue),
- ActiveWorkers: activeWorkers,
- TotalTasks: len(as.tasks),
- }
-}
-
-// GetQueuedTaskCount returns the number of queued tasks
-func (as *AdminServer) GetQueuedTaskCount() int {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
- return len(as.taskQueue)
-}
-
-// GetActiveTaskCount returns the number of active tasks
-func (as *AdminServer) GetActiveTaskCount() int {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
- return len(as.activeTasks)
-}
-
-// GetTaskHistory returns task history
-func (as *AdminServer) GetTaskHistory() []TaskHistoryEntry {
- as.mutex.RLock()
- defer as.mutex.RUnlock()
-
- // Return a copy of the history
- history := make([]TaskHistoryEntry, len(as.taskHistory))
- copy(history, as.taskHistory)
- return history
-}
diff --git a/weed/admin/task_minimal/go.mod b/weed/admin/task_minimal/go.mod
deleted file mode 100644
index 3af5d3746..000000000
--- a/weed/admin/task_minimal/go.mod
+++ /dev/null
@@ -1,3 +0,0 @@
-module task_minimal
-
-go 1.24.1
diff --git a/weed/admin/task_minimal/integration_test.go b/weed/admin/task_minimal/integration_test.go
deleted file mode 100644
index a7859e569..000000000
--- a/weed/admin/task_minimal/integration_test.go
+++ /dev/null
@@ -1,233 +0,0 @@
-package task
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/seaweedfs/seaweedfs/weed/worker/types"
-)
-
-// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies
-func TestSimpleIntegration(t *testing.T) {
- t.Logf("Starting simple integration test")
-
- // Step 1: Create a minimal admin server configuration
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- // Step 2: Create admin server with nil master client (for testing)
- adminServer := NewAdminServer(config, nil)
-
- // Step 3: Start admin server
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Step 4: Test worker registration
- t.Logf("Testing worker registration")
-
- worker := &types.Worker{
- ID: "test-worker-1",
- Address: "localhost:9001",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 2,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
- t.Logf("Successfully registered worker %s", worker.ID)
-
- // Step 5: Test task queueing
- t.Logf("Testing task queueing")
-
- task := &types.Task{
- ID: "test-task-1",
- Type: types.TaskTypeVacuum,
- VolumeID: 1001,
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Fatalf("Failed to queue task: %v", err)
- }
- t.Logf("Successfully queued task %s", task.ID)
-
- // Step 6: Test task request by worker
- t.Logf("Testing task request")
-
- assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
- if err != nil {
- t.Fatalf("Failed to request task: %v", err)
- }
-
- if assignedTask != nil {
- t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
-
- // Step 7: Test task progress updates
- t.Logf("Testing task progress updates")
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
- if err != nil {
- t.Errorf("Failed to update task progress: %v", err)
- }
-
- // Step 8: Test task completion
- t.Logf("Testing task completion")
-
- err = adminServer.CompleteTask(assignedTask.ID, true, "")
- if err != nil {
- t.Errorf("Failed to complete task: %v", err)
- }
- t.Logf("Successfully completed task %s", assignedTask.ID)
- } else {
- t.Logf("No task was assigned (queue might be empty)")
- }
-
- // Step 9: Test basic metrics
- t.Logf("Testing basic metrics")
-
- stats := adminServer.GetSystemStats()
- if stats != nil {
- t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d",
- stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers)
- }
-
- queuedCount := adminServer.GetQueuedTaskCount()
- activeCount := adminServer.GetActiveTaskCount()
- t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
-
- // Step 10: Test task history
- history := adminServer.GetTaskHistory()
- t.Logf("Task history contains %d entries", len(history))
-
- t.Logf("Simple integration test completed successfully")
-}
-
-// TestWorkerHeartbeat tests worker heartbeat functionality
-func TestWorkerHeartbeat(t *testing.T) {
- t.Logf("Testing worker heartbeat")
-
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Register a worker
- worker := &types.Worker{
- ID: "heartbeat-worker",
- Address: "localhost:9002",
- Capabilities: []types.TaskType{types.TaskTypeVacuum},
- MaxConcurrent: 1,
- Status: "active",
- CurrentLoad: 0,
- LastHeartbeat: time.Now(),
- }
-
- err = adminServer.RegisterWorker(worker)
- if err != nil {
- t.Fatalf("Failed to register worker: %v", err)
- }
-
- // Test heartbeat update
- status := &types.WorkerStatus{
- Status: "active",
- CurrentLoad: 0,
- }
-
- err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
- if err != nil {
- t.Errorf("Failed to update worker heartbeat: %v", err)
- }
-
- t.Logf("Worker heartbeat test completed successfully")
-}
-
-// TestTaskQueueOperations tests task queue operations
-func TestTaskQueueOperations(t *testing.T) {
- t.Logf("Testing task queue operations")
-
- config := &AdminConfig{
- ScanInterval: 10 * time.Second,
- WorkerTimeout: 30 * time.Second,
- TaskTimeout: 2 * time.Hour,
- MaxRetries: 3,
- ReconcileInterval: 5 * time.Minute,
- EnableFailureRecovery: true,
- MaxConcurrentTasks: 5,
- }
-
- adminServer := NewAdminServer(config, nil)
- err := adminServer.Start()
- if err != nil {
- t.Fatalf("Failed to start admin server: %v", err)
- }
- defer adminServer.Stop()
-
- // Test queuing multiple tasks
- for i := 0; i < 3; i++ {
- task := &types.Task{
- ID: fmt.Sprintf("queue-test-task-%d", i),
- Type: types.TaskTypeVacuum,
- VolumeID: uint32(2000 + i),
- Server: "localhost:8080",
- Status: types.TaskStatusPending,
- Priority: types.TaskPriorityNormal,
- Parameters: map[string]interface{}{
- "garbage_threshold": "0.3",
- },
- CreatedAt: time.Now(),
- }
-
- err = adminServer.QueueTask(task)
- if err != nil {
- t.Errorf("Failed to queue task %d: %v", i, err)
- }
- }
-
- // Check queue size
- queuedCount := adminServer.GetQueuedTaskCount()
- if queuedCount != 3 {
- t.Errorf("Expected 3 queued tasks, got %d", queuedCount)
- }
-
- t.Logf("Task queue operations test completed successfully")
-}