aboutsummaryrefslogtreecommitdiff
path: root/test/s3/parquet/test_implicit_directory_fix.py
diff options
context:
space:
mode:
Diffstat (limited to 'test/s3/parquet/test_implicit_directory_fix.py')
-rwxr-xr-xtest/s3/parquet/test_implicit_directory_fix.py307
1 files changed, 307 insertions, 0 deletions
diff --git a/test/s3/parquet/test_implicit_directory_fix.py b/test/s3/parquet/test_implicit_directory_fix.py
new file mode 100755
index 000000000..9ac8f0346
--- /dev/null
+++ b/test/s3/parquet/test_implicit_directory_fix.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the implicit directory fix for s3fs compatibility.
+
+This test verifies that:
+1. Implicit directory markers (0-byte objects with children) return 404 on HEAD
+2. s3fs correctly identifies them as directories via LIST fallback
+3. PyArrow can read datasets created with write_dataset()
+
+The fix makes SeaweedFS behave like AWS S3 and improves s3fs compatibility.
+"""
+
+import io
+import logging
+import os
+import sys
+import traceback
+
+import pyarrow as pa
+import pyarrow.dataset as pads
+import pyarrow.parquet as pq
+import s3fs
+import boto3
+from botocore.exceptions import ClientError
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Configuration
+S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "http://localhost:8333")
+S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
+S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
+BUCKET_NAME = os.getenv("BUCKET_NAME", "test-implicit-dir")
+
def create_sample_table(num_rows: int = 1000) -> pa.Table:
    """Build a small PyArrow table with id/value/score columns for testing.

    Args:
        num_rows: Number of rows to generate (default 1000).

    Returns:
        A pa.Table with an int64 'id', string 'value', and float64 'score' column.
    """
    ids = pa.array(range(num_rows), type=pa.int64())
    values = pa.array([f'value_{i}' for i in range(num_rows)], type=pa.string())
    scores = pa.array([i * 1.5 for i in range(num_rows)], type=pa.float64())
    return pa.table({'id': ids, 'value': values, 'score': scores})
+
def setup_s3():
    """Create the two S3 clients used by the tests.

    Returns:
        Tuple of (s3fs.S3FileSystem, boto3 S3 client), both configured from
        the module-level endpoint/credential settings. The s3fs handle is used
        for filesystem-style access; the boto3 client for raw S3 operations.
    """
    # Filesystem-style client (used by PyArrow and directory-detection tests).
    fs = s3fs.S3FileSystem(
        key=S3_ACCESS_KEY,
        secret=S3_SECRET_KEY,
        client_kwargs={'endpoint_url': S3_ENDPOINT_URL},
        use_ssl=False,
    )

    # Low-level client for issuing raw HEAD/PUT requests.
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_ENDPOINT_URL,
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        use_ssl=False,
    )

    return fs, s3_client
+
def test_implicit_directory_head_behavior(fs, s3_client):
    """Test that HEAD on implicit directory markers returns 404.

    Writes a PyArrow dataset (which creates an implicit directory) and then
    issues a raw HEAD request on the directory key without a trailing slash.
    With the fix in place the server should answer 404, matching AWS S3, so
    that s3fs falls back to LIST-based directory detection.

    Args:
        fs: s3fs.S3FileSystem handle.
        s3_client: boto3 S3 client.

    Returns:
        bool: True if HEAD returned 404 as expected, False otherwise.
    """
    logger.info("\n" + "="*80)
    logger.info("TEST 1: Implicit Directory HEAD Behavior")
    logger.info("="*80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # Best-effort cleanup of leftovers from a previous run.
    # BUGFIX: was a bare `except:`, which also swallows KeyboardInterrupt and
    # SystemExit; narrow it to Exception so interrupts still propagate.
    try:
        fs.rm(test_path, recursive=True)
    except Exception:
        pass

    # Create a dataset using PyArrow (creates implicit directory)
    logger.info(f"Creating dataset at: {test_path}")
    table = create_sample_table(1000)
    pads.write_dataset(table, test_path, filesystem=fs, format='parquet')

    # List what was created
    logger.info("\nFiles created:")
    files = fs.ls(test_path, detail=True)
    for f in files:
        logger.info(f" {f['name']} - size: {f['size']} bytes, type: {f['type']}")

    # HEAD the marker key (no trailing slash): implicit directories must 404.
    logger.info(f"\nTesting HEAD on: {test_path}")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='test_implicit_dir')
        logger.info(f" HEAD response: {response['ResponseMetadata']['HTTPStatusCode']}")
        logger.info(f" Content-Length: {response.get('ContentLength', 'N/A')}")
        logger.info(f" Content-Type: {response.get('ContentType', 'N/A')}")
        logger.warning(" ⚠️ Expected 404, but got 200 - fix may not be working")
        return False
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            logger.info(" ✓ HEAD returned 404 (expected - implicit directory)")
            return True
        else:
            logger.error(f" ✗ Unexpected error: {e}")
            return False
+
def test_s3fs_directory_detection(fs):
    """Verify that s3fs.info() reports the implicit directory as a directory.

    Args:
        fs: s3fs.S3FileSystem handle.

    Returns:
        bool: True when s3fs classifies the path as a directory.
    """
    logger.info("\n" + "=" * 80)
    logger.info("TEST 2: s3fs Directory Detection")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # fs.info() should fall back to a LIST when HEAD 404s, and classify the
    # key as a directory because it has children.
    logger.info(f"\nTesting s3fs.info('{test_path}'):")
    try:
        info = fs.info(test_path)
        logger.info(f" Type: {info.get('type', 'N/A')}")
        logger.info(f" Size: {info.get('size', 'N/A')}")

        if info.get('type') != 'directory':
            logger.warning(f" ⚠️ s3fs identified as: {info.get('type')}")
            return False
        logger.info(" ✓ s3fs correctly identified as directory")
        return True
    except Exception as e:
        logger.error(f" ✗ Error: {e}")
        return False
+
def test_s3fs_isdir(fs):
    """Verify that s3fs.isdir() returns True for the implicit directory.

    Args:
        fs: s3fs.S3FileSystem handle.

    Returns:
        bool: The value of fs.isdir() on the test path (False on error).
    """
    logger.info("\n" + "=" * 80)
    logger.info("TEST 3: s3fs.isdir() Method")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nTesting s3fs.isdir('{test_path}'):")
    try:
        is_directory = fs.isdir(test_path)
        logger.info(f" Result: {is_directory}")

        if not is_directory:
            logger.warning(" ⚠️ s3fs.isdir() returned False")
            return False
        logger.info(" ✓ s3fs.isdir() correctly returned True")
        return True
    except Exception as e:
        logger.error(f" ✗ Error: {e}")
        return False
+
def test_pyarrow_dataset_read(fs):
    """Verify PyArrow can open and fully materialize the dataset written earlier.

    Args:
        fs: s3fs.S3FileSystem handle.

    Returns:
        bool: True when the dataset loads without error.
    """
    logger.info("\n" + "=" * 80)
    logger.info("TEST 4: PyArrow Dataset Read")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nReading dataset from: {test_path}")
    try:
        dataset = pads.dataset(test_path, filesystem=fs, format='parquet')
        loaded = dataset.to_table()
        logger.info(f" ✓ Successfully read {len(loaded)} rows")
        logger.info(f" Columns: {loaded.column_names}")
        return True
    except Exception as e:
        # Print the full stack so read failures are diagnosable from the log.
        logger.error(f" ✗ Failed to read dataset: {e}")
        traceback.print_exc()
        return False
+
def test_explicit_directory_marker(fs, s3_client):
    """Verify explicit directory markers (trailing-slash keys) still HEAD as 200.

    Args:
        fs: s3fs.S3FileSystem handle (unused here, kept for a uniform signature).
        s3_client: boto3 S3 client.

    Returns:
        bool: True when the marker is created and HEAD succeeds.
    """
    logger.info("\n" + "=" * 80)
    logger.info("TEST 5: Explicit Directory Marker (with trailing slash)")
    logger.info("=" * 80)

    # A 0-byte object whose key ends in '/' with the directory mime type.
    logger.info(f"\nCreating explicit directory: {BUCKET_NAME}/explicit_dir/")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key='explicit_dir/',
            Body=b'',
            ContentType='httpd/unix-directory',
        )
        logger.info(" ✓ Created explicit directory marker")
    except Exception as e:
        logger.error(f" ✗ Failed to create: {e}")
        return False

    # Unlike implicit markers, an explicit trailing-slash key must HEAD OK.
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/explicit_dir/")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='explicit_dir/')
    except ClientError as e:
        logger.error(f" ✗ HEAD failed: {e}")
        return False
    logger.info(f" ✓ HEAD returned 200 (expected for explicit directory)")
    logger.info(f" Content-Type: {response.get('ContentType', 'N/A')}")
    return True
+
def test_empty_file_not_directory(fs, s3_client):
    """Verify a genuine 0-byte file is still served as a file, not a directory.

    Guards against the fix over-reaching: only implicit directory markers
    should 404 on HEAD, not every empty object.

    Args:
        fs: s3fs.S3FileSystem handle.
        s3_client: boto3 S3 client.

    Returns:
        bool: True when HEAD succeeds and s3fs reports type 'file'.
    """
    logger.info("\n" + "=" * 80)
    logger.info("TEST 6: Empty File (not a directory)")
    logger.info("=" * 80)

    # Empty body but an ordinary mime type: must remain a plain file.
    logger.info(f"\nCreating empty file: {BUCKET_NAME}/empty.txt")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key='empty.txt',
            Body=b'',
            ContentType='text/plain',
        )
        logger.info(" ✓ Created empty file")
    except Exception as e:
        logger.error(f" ✗ Failed to create: {e}")
        return False

    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/empty.txt")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='empty.txt')
        logger.info(f" ✓ HEAD returned 200 (expected for empty file)")
        logger.info(f" Content-Type: {response.get('ContentType', 'N/A')}")

        # Double-check via s3fs that the object is classified as a file.
        info = fs.info(f"{BUCKET_NAME}/empty.txt")
        if info.get('type') != 'file':
            logger.warning(f" ⚠️ s3fs identified as: {info.get('type')}")
            return False
        logger.info(" ✓ s3fs correctly identified as file")
        return True
    except Exception as e:
        logger.error(f" ✗ Error: {e}")
        return False
+
def main():
    """Run the full test suite.

    Returns:
        int: Process exit code — 0 when every test passes, 1 otherwise.
    """
    banner = "=" * 80
    logger.info(banner)
    logger.info("Implicit Directory Fix Test Suite")
    logger.info(banner)
    logger.info(f"Endpoint: {S3_ENDPOINT_URL}")
    logger.info(f"Bucket: {BUCKET_NAME}")
    logger.info(banner)

    # Set up S3 clients
    fs, s3_client = setup_s3()

    # Ensure the target bucket exists; "already exists" errors are fine.
    try:
        s3_client.create_bucket(Bucket=BUCKET_NAME)
        logger.info(f"\n✓ Created bucket: {BUCKET_NAME}")
    except ClientError as e:
        if e.response['Error']['Code'] not in ('BucketAlreadyOwnedByYou', 'BucketAlreadyExists'):
            logger.error(f"\n✗ Failed to create bucket: {e}")
            return 1
        logger.info(f"\n✓ Bucket already exists: {BUCKET_NAME}")

    # Data-driven suite: (display name, zero-arg runner) pairs, executed in order.
    suite = [
        ("Implicit Directory HEAD", lambda: test_implicit_directory_head_behavior(fs, s3_client)),
        ("s3fs Directory Detection", lambda: test_s3fs_directory_detection(fs)),
        ("s3fs.isdir() Method", lambda: test_s3fs_isdir(fs)),
        ("PyArrow Dataset Read", lambda: test_pyarrow_dataset_read(fs)),
        ("Explicit Directory Marker", lambda: test_explicit_directory_marker(fs, s3_client)),
        ("Empty File Not Directory", lambda: test_empty_file_not_directory(fs, s3_client)),
    ]
    results = [(name, run()) for name, run in suite]

    # Print summary
    logger.info("\n" + banner)
    logger.info("TEST SUMMARY")
    logger.info(banner)

    passed = sum(1 for _, ok in results if ok)
    total = len(results)

    for name, ok in results:
        logger.info(f"{'✓ PASS' if ok else '✗ FAIL'}: {name}")

    logger.info(banner)
    logger.info(f"Results: {passed}/{total} tests passed")
    logger.info(banner)

    if passed == total:
        logger.info("\n🎉 All tests passed! The implicit directory fix is working correctly.")
        return 0
    logger.warning(f"\n⚠️ {total - passed} test(s) failed. The fix may not be fully working.")
    return 1


if __name__ == "__main__":
    sys.exit(main())
+