aboutsummaryrefslogtreecommitdiff
path: root/seaweedfs-rdma-sidecar/docker/scripts/setup-soft-roce.sh
blob: 55c8f3b80541c4af0e4528872d712449b8b015b9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/bin/bash

# Setup Soft-RoCE (RXE) for RDMA simulation
# This script enables RDMA over Ethernet using the RXE kernel module
#
# Flow (see main at the bottom of the file): check for root privileges,
# load the RXE kernel module, bind an RXE device to a network interface,
# verify that an RDMA device is visible, then print basic diagnostics.

# Abort on the first unhandled command failure. pipefail is NOT enabled, so
# a pipeline's exit status is that of its last command only.
set -e

echo "๐Ÿ”ง Setting up Soft-RoCE (RXE) RDMA simulation..."

# Abort unless the script is running as root.
# Reads bash's $EUID; returns 0 for root, otherwise prints a hint on how to
# re-run the script and exits with status 1.
check_privileges() {
    # Root needs no further checks.
    if [ "$EUID" -eq 0 ]; then
        return 0
    fi

    printf '%s\n' \
        "โŒ This script requires root privileges" \
        "Run with: sudo $0 or inside a privileged container"
    exit 1
}

# Load the Soft-RoCE kernel module and confirm it is listed by lsmod.
# Tries rdma_rxe first and falls back to rxe_net (older kernels).
# Exits with status 1 when neither module can be loaded or when lsmod does
# not list either of them afterwards.
load_rxe_module() {
    echo "๐Ÿ“ฆ Loading RXE kernel module..."

    # modprobe's own error output is discarded; only its status is used.
    if ! modprobe rdma_rxe 2>/dev/null; then
        echo "โš ๏ธ  Failed to load rdma_rxe module, trying alternative approach..."

        # Alternative module name used by older kernels.
        if ! modprobe rxe_net 2>/dev/null; then
            printf '%s\n' \
                "โŒ Failed to load RXE modules. Possible causes:" \
                "  - Kernel doesn't support RXE (needs CONFIG_RDMA_RXE=m)" \
                "  - Running in unprivileged container" \
                "  - Missing kernel modules" \
                "" \
                "๐Ÿ”ง Workaround: Run container with --privileged flag"
            exit 1
        fi
        echo "โœ… rxe_net module loaded successfully"
    else
        echo "โœ… rdma_rxe module loaded successfully"
    fi

    # Double-check against the list of loaded modules.
    if ! lsmod | grep -q "rdma_rxe\|rxe_net"; then
        echo "โŒ RXE module verification failed"
        exit 1
    fi
    echo "โœ… RXE module verification successful"
}

# Function to setup virtual RDMA device
#
# Picks a network interface and binds a Soft-RoCE (RXE) device to it.
#
# Arguments:
#   $1 - optional interface name. When omitted, $RXE_INTERFACE is used if it
#        is set; otherwise the first existing interface among eth0, enp0s3,
#        enp0s8 and lo is chosen.
# Outputs: progress messages on stdout.
# Exits with status 1 when no candidate interface exists. The sysfs fallback
# (setup_rxe_manual) is also invoked as a last resort.
#
# Creation is attempted with, in order:
#   1. `rdma link add rxe0 type rxe netdev IFACE` (iproute2 netlink interface)
#   2. `rxe_cfg add IFACE` (legacy rdma-core helper)
#   3. setup_rxe_manual IFACE (legacy sysfs module parameter)
# NOTE(review): rxe_cfg and the sysfs `add` parameter are legacy mechanisms
# that newer rdma-core/kernel releases may no longer provide - confirm against
# the target versions. They are kept as fallbacks for older systems.
setup_rxe_device() {
    echo "๐ŸŒ Setting up RXE device over Ethernet interface..."

    # An explicit request (argument or environment) replaces the default
    # candidate list; reuse the function's positional parameters as the list.
    local requested="${1:-${RXE_INTERFACE:-}}"
    if [ -n "$requested" ]; then
        set -- "$requested"
    else
        set -- eth0 enp0s3 enp0s8 lo
    fi

    # Find available network interface (first candidate that exists wins)
    local interface="" iface
    for iface in "$@"; do
        if ip link show "$iface" >/dev/null 2>&1; then
            interface="$iface"
            break
        fi
    done

    if [ -z "$interface" ]; then
        echo "โŒ No suitable network interface found"
        echo "Available interfaces:"
        ip link show | grep "^[0-9]" | cut -d':' -f2 | tr -d ' '
        exit 1
    fi

    echo "๐Ÿ“ก Using network interface: $interface"

    # Create RXE device
    echo "๐Ÿ”จ Creating RXE device on $interface..."

    # Preferred path: iproute2's rdma tool. rxe0 is the name the legacy
    # tools would have assigned to the first device.
    if command -v rdma >/dev/null 2>&1; then
        if rdma link add rxe0 type rxe netdev "$interface"; then
            return 0
        fi

        # The add fails when the script is re-run and the link already
        # exists; treat an existing RDMA link on this netdev as success.
        if rdma link show 2>/dev/null | awk -v dev="$interface" '
            { for (i = 1; i < NF; i++) if ($i == "netdev" && $(i + 1) == dev) found = 1 }
            END { exit !found }'; then
            echo "โœ… RDMA link already present on $interface"
            return 0
        fi

        echo "โš ๏ธ  rdma link add failed, trying legacy tools..."
    fi

    # Legacy path: rxe_cfg, then the sysfs interface
    if command -v rxe_cfg >/dev/null 2>&1; then
        rxe_cfg add "$interface" || {
            echo "โš ๏ธ  rxe_cfg failed, trying manual approach..."
            setup_rxe_manual "$interface"
        }
    else
        echo "โš ๏ธ  rxe_cfg not available, using manual setup..."
        setup_rxe_manual "$interface"
    fi
}

# Create an RXE device by writing the interface name to the rdma_rxe
# module's sysfs "add" parameter.
# Arguments: $1 - network interface name
# Exits with status 1 when /sys/module/rdma_rxe is missing or the write fails.
setup_rxe_manual() {
    local netdev="$1"

    # Without the module's sysfs directory there is nothing to write to.
    if [ ! -d /sys/module/rdma_rxe ]; then
        echo "โŒ RXE sysfs interface not found"
        exit 1
    fi

    if ! echo "$netdev" > /sys/module/rdma_rxe/parameters/add 2>/dev/null; then
        echo "โŒ Failed to add RXE device via sysfs"
        exit 1
    fi
}

# Function to verify RDMA devices
#
# Confirms that at least one RDMA device is registered and prints a short
# summary of each one.
#
# Arguments:
#   $1 - optional directory to inspect; defaults to /sys/class/infiniband
# Outputs: device count, device names and, when ibv_devinfo is installed,
#          the first 10 lines of `ibv_devinfo -d NAME` per device.
# Exits with status 1 when the directory is missing or contains no entries.
verify_rdma_devices() {
    local ib_dir="${1:-/sys/class/infiniband}"
    ib_dir="${ib_dir%/}"

    echo "๐Ÿ” Verifying RDMA devices..."

    if [ ! -d "$ib_dir" ]; then
        echo "โŒ $ib_dir directory not found"
        exit 1
    fi

    # Collect entries with a glob instead of parsing `ls` output. An
    # unmatched glob stays literal, hence the existence check.
    local -a entries=()
    local entry
    for entry in "$ib_dir"/*; do
        if [ -e "$entry" ] || [ -L "$entry" ]; then
            entries+=("$entry")
        fi
    done

    if [ "${#entries[@]}" -eq 0 ]; then
        echo "โŒ No RDMA devices found in $ib_dir/"
        exit 1
    fi

    echo "โœ… Found ${#entries[@]} RDMA device(s):"
    printf '%s\n' "${entries[@]##*/}"

    # Show device details (directories / symlinks to directories only)
    local dev_name
    for entry in "${entries[@]}"; do
        [ -d "$entry" ] || continue
        dev_name="${entry##*/}"
        echo "  ๐Ÿ“‹ Device: $dev_name"

        # Try to get device info
        if command -v ibv_devinfo >/dev/null 2>&1; then
            ibv_devinfo -d "$dev_name" | head -10
        fi
    done
}

# Print basic RDMA diagnostics from whichever tools are installed.
# Shows the first 20 lines of `ibv_devinfo` and the first 10 lines of
# `ucx_info -d`; a missing tool only produces a warning line.
test_basic_rdma() {
    echo "๐Ÿงช Testing basic RDMA functionality..."

    # libibverbs device listing
    if ! command -v ibv_devinfo >/dev/null 2>&1; then
        echo "โš ๏ธ  ibv_devinfo not available"
    else
        echo "๐Ÿ“‹ RDMA device information:"
        ibv_devinfo | head -n 20
    fi

    # UCX device listing, when UCX is installed
    if ! command -v ucx_info >/dev/null 2>&1; then
        echo "โš ๏ธ  UCX tools not available"
    else
        echo "๐Ÿ“‹ UCX information:"
        ucx_info -d | head -n 10
    fi
}

# Entry point: runs every setup stage in order, then prints a summary with
# suggested next steps. Arguments are accepted but not read.
main() {
    local rule="======================================"
    local stage

    printf '%s\n' \
        "๐Ÿš€ Starting Soft-RoCE RDMA simulation setup..." \
        "$rule"

    # Stages run strictly in this order; each one exits the script itself
    # when it hits a fatal problem.
    for stage in \
        check_privileges \
        load_rxe_module \
        setup_rxe_device \
        verify_rdma_devices \
        test_basic_rdma; do
        "$stage"
    done

    printf '%s\n' \
        "" \
        "๐ŸŽ‰ Soft-RoCE setup completed successfully!" \
        "$rule" \
        "โœ… RDMA simulation is ready for testing" \
        "๐Ÿ“ก You can now run RDMA applications" \
        "" \
        "Next steps:" \
        "  - Test with: /opt/rdma-sim/test-rdma.sh" \
        "  - Check UCX: /opt/rdma-sim/ucx-info.sh" \
        "  - Run your RDMA applications"
}

# Execute main function, forwarding the script's command-line arguments
# (main does not currently read them).
main "$@"