Diffstat (limited to 'test/s3/parquet/test_implicit_directory_fix.py')
| -rwxr-xr-x | test/s3/parquet/test_implicit_directory_fix.py | 307 |
1 file changed, 307 insertions(+), 0 deletions(-)
diff --git a/test/s3/parquet/test_implicit_directory_fix.py b/test/s3/parquet/test_implicit_directory_fix.py
new file mode 100755
index 000000000..9ac8f0346
--- /dev/null
+++ b/test/s3/parquet/test_implicit_directory_fix.py
@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
Test script to verify the implicit directory fix for s3fs compatibility.

This test verifies that:
1. Implicit directory markers (0-byte objects with children) return 404 on HEAD
2. s3fs correctly identifies them as directories via LIST fallback
3. PyArrow can read datasets created with write_dataset()

The fix makes SeaweedFS behave like AWS S3 and improves s3fs compatibility.
"""

import logging
import os
import sys
import traceback

import boto3
import pyarrow as pa
import pyarrow.dataset as pads
import s3fs
from botocore.exceptions import ClientError

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "http://localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "test-implicit-dir")


def create_sample_table(num_rows: int = 1000) -> pa.Table:
    """Create a sample PyArrow table."""
    return pa.table({
        'id': pa.array(range(num_rows), type=pa.int64()),
        'value': pa.array([f'value_{i}' for i in range(num_rows)], type=pa.string()),
        'score': pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
    })


def setup_s3():
    """Set up the s3fs and boto3 clients."""
    # s3fs client
    fs = s3fs.S3FileSystem(
        key=S3_ACCESS_KEY,
        secret=S3_SECRET_KEY,
        client_kwargs={'endpoint_url': S3_ENDPOINT_URL},
        use_ssl=False
    )

    # boto3 client for raw S3 operations
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_ENDPOINT_URL,
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        use_ssl=False
    )

    return fs, s3_client


def test_implicit_directory_head_behavior(fs, s3_client):
    """Test that HEAD on implicit directory markers returns 404."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 1: Implicit Directory HEAD Behavior")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # Clean up any existing data
    try:
        fs.rm(test_path, recursive=True)
    except FileNotFoundError:
        pass

    # Create a dataset using PyArrow (creates an implicit directory)
    logger.info(f"Creating dataset at: {test_path}")
    table = create_sample_table(1000)
    pads.write_dataset(table, test_path, filesystem=fs, format='parquet')

    # List what was created
    logger.info("\nFiles created:")
    files = fs.ls(test_path, detail=True)
    for f in files:
        logger.info(f"  {f['name']} - size: {f['size']} bytes, type: {f['type']}")

    # Test HEAD request on the directory marker (without trailing slash)
    logger.info(f"\nTesting HEAD on: {test_path}")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='test_implicit_dir')
        logger.info(f"  HEAD response: {response['ResponseMetadata']['HTTPStatusCode']}")
        logger.info(f"  Content-Length: {response.get('ContentLength', 'N/A')}")
        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")
        logger.warning("  ⚠️ Expected 404, but got 200 - fix may not be working")
        return False
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            logger.info("  ✓ HEAD returned 404 (expected - implicit directory)")
            return True
        else:
            logger.error(f"  ✗ Unexpected error: {e}")
            return False


def test_s3fs_directory_detection(fs):
    """Test that s3fs correctly detects the directory."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 2: s3fs Directory Detection")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # Test s3fs.info()
    logger.info(f"\nTesting s3fs.info('{test_path}'):")
    try:
        info = fs.info(test_path)
        logger.info(f"  Type: {info.get('type', 'N/A')}")
        logger.info(f"  Size: {info.get('size', 'N/A')}")

        if info.get('type') == 'directory':
            logger.info("  ✓ s3fs correctly identified as directory")
            return True
        else:
            logger.warning(f"  ⚠️ s3fs identified as: {info.get('type')}")
            return False
    except Exception as e:
        logger.error(f"  ✗ Error: {e}")
        return False


def test_s3fs_isdir(fs):
    """Test that s3fs.isdir() works correctly."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 3: s3fs.isdir() Method")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nTesting s3fs.isdir('{test_path}'):")
    try:
        is_dir = fs.isdir(test_path)
        logger.info(f"  Result: {is_dir}")

        if is_dir:
            logger.info("  ✓ s3fs.isdir() correctly returned True")
            return True
        else:
            logger.warning("  ⚠️ s3fs.isdir() returned False")
            return False
    except Exception as e:
        logger.error(f"  ✗ Error: {e}")
        return False


def test_pyarrow_dataset_read(fs):
    """Test that PyArrow can read the dataset."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 4: PyArrow Dataset Read")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nReading dataset from: {test_path}")
    try:
        ds = pads.dataset(test_path, filesystem=fs, format='parquet')
        table = ds.to_table()
        logger.info(f"  ✓ Successfully read {len(table)} rows")
        logger.info(f"  Columns: {table.column_names}")
        return True
    except Exception as e:
        logger.error(f"  ✗ Failed to read dataset: {e}")
        traceback.print_exc()
        return False


def test_explicit_directory_marker(fs, s3_client):
    """Test that explicit directory markers (with trailing slash) still work."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 5: Explicit Directory Marker (with trailing slash)")
    logger.info("=" * 80)

    # Create an explicit directory marker
    logger.info(f"\nCreating explicit directory: {BUCKET_NAME}/explicit_dir/")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key='explicit_dir/',
            Body=b'',
            ContentType='httpd/unix-directory'
        )
        logger.info("  ✓ Created explicit directory marker")
    except Exception as e:
        logger.error(f"  ✗ Failed to create: {e}")
        return False

    # Test HEAD with trailing slash
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/explicit_dir/")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='explicit_dir/')
        logger.info("  ✓ HEAD returned 200 (expected for explicit directory)")
        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")
        return True
    except ClientError as e:
        logger.error(f"  ✗ HEAD failed: {e}")
        return False


def test_empty_file_not_directory(fs, s3_client):
    """Test that legitimate empty files are not treated as directories."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 6: Empty File (not a directory)")
    logger.info("=" * 80)

    # Create an empty file with text/plain mime type
    logger.info(f"\nCreating empty file: {BUCKET_NAME}/empty.txt")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key='empty.txt',
            Body=b'',
            ContentType='text/plain'
        )
        logger.info("  ✓ Created empty file")
    except Exception as e:
        logger.error(f"  ✗ Failed to create: {e}")
        return False

    # Test HEAD
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/empty.txt")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='empty.txt')
        logger.info("  ✓ HEAD returned 200 (expected for empty file)")
        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")

        # Verify s3fs doesn't think it's a directory
        info = fs.info(f"{BUCKET_NAME}/empty.txt")
        if info.get('type') == 'file':
            logger.info("  ✓ s3fs correctly identified as file")
            return True
        else:
            logger.warning(f"  ⚠️ s3fs identified as: {info.get('type')}")
            return False
    except Exception as e:
        logger.error(f"  ✗ Error: {e}")
        return False


def main():
    """Run all tests."""
    logger.info("=" * 80)
    logger.info("Implicit Directory Fix Test Suite")
    logger.info("=" * 80)
    logger.info(f"Endpoint: {S3_ENDPOINT_URL}")
    logger.info(f"Bucket: {BUCKET_NAME}")
    logger.info("=" * 80)

    # Set up S3 clients
    fs, s3_client = setup_s3()

    # Create bucket if it doesn't exist
    try:
        s3_client.create_bucket(Bucket=BUCKET_NAME)
        logger.info(f"\n✓ Created bucket: {BUCKET_NAME}")
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code in ['BucketAlreadyOwnedByYou', 'BucketAlreadyExists']:
            logger.info(f"\n✓ Bucket already exists: {BUCKET_NAME}")
        else:
            logger.error(f"\n✗ Failed to create bucket: {e}")
            return 1

    # Run tests
    results = []

    results.append(("Implicit Directory HEAD", test_implicit_directory_head_behavior(fs, s3_client)))
    results.append(("s3fs Directory Detection", test_s3fs_directory_detection(fs)))
    results.append(("s3fs.isdir() Method", test_s3fs_isdir(fs)))
    results.append(("PyArrow Dataset Read", test_pyarrow_dataset_read(fs)))
    results.append(("Explicit Directory Marker", test_explicit_directory_marker(fs, s3_client)))
    results.append(("Empty File Not Directory", test_empty_file_not_directory(fs, s3_client)))

    # Print summary
    logger.info("\n" + "=" * 80)
    logger.info("TEST SUMMARY")
    logger.info("=" * 80)

    passed = sum(1 for _, result in results if result)
    total = len(results)

    for name, result in results:
        status = "✓ PASS" if result else "✗ FAIL"
        logger.info(f"{status}: {name}")

    logger.info("=" * 80)
    logger.info(f"Results: {passed}/{total} tests passed")
    logger.info("=" * 80)

    if passed == total:
        logger.info("\n🎉 All tests passed! The implicit directory fix is working correctly.")
        return 0
    else:
        logger.warning(f"\n⚠️ {total - passed} test(s) failed. The fix may not be fully working.")
        return 1


if __name__ == "__main__":
    sys.exit(main())
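
For context, the LIST fallback that the test's docstring refers to can be sketched in a few lines of boto3. This is an illustrative approximation of what an S3 client such as s3fs does internally when HEAD returns 404, not s3fs's actual code; `is_implicit_directory` is a hypothetical helper name:

# Illustrative sketch (not s3fs's implementation): when HEAD on a key
# returns 404, the key may still be an "implicit directory" if any
# object exists under the prefix 'key/'.
from botocore.exceptions import ClientError

def is_implicit_directory(s3_client, bucket: str, key: str) -> bool:
    """Hypothetical helper: True if `key` has no object but has children."""
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return False  # a real object exists at this key
    except ClientError as e:
        if e.response['Error']['Code'] != '404':
            raise
    # HEAD gave 404: fall back to a LIST for children under 'key/'
    resp = s3_client.list_objects_v2(
        Bucket=bucket, Prefix=key.rstrip('/') + '/', MaxKeys=1
    )
    return resp.get('KeyCount', 0) > 0

With the fix in place, test 1 exercises the 404 path on HEAD, and tests 2 and 3 confirm that s3fs's fallback then classifies the prefix as a directory.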
