Diffstat (limited to 'test/s3/parquet/test_implicit_directory_fix.py')
| -rwxr-xr-x | test/s3/parquet/test_implicit_directory_fix.py | 307 |
1 file changed, 307 insertions(+), 0 deletions(-)
diff --git a/test/s3/parquet/test_implicit_directory_fix.py b/test/s3/parquet/test_implicit_directory_fix.py
new file mode 100755
index 000000000..9ac8f0346
--- /dev/null
+++ b/test/s3/parquet/test_implicit_directory_fix.py
@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
Test script to verify the implicit directory fix for s3fs compatibility.

This test verifies that:
1. Implicit directory markers (0-byte objects with children) return 404 on HEAD
2. s3fs correctly identifies them as directories via LIST fallback
3. PyArrow can read datasets created with write_dataset()

The fix makes SeaweedFS behave like AWS S3 and improves s3fs compatibility.
"""

import logging
import os
import sys
import traceback

import boto3
import pyarrow as pa
import pyarrow.dataset as pads
import s3fs
from botocore.exceptions import ClientError

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "http://localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "test-implicit-dir")


def create_sample_table(num_rows: int = 1000) -> pa.Table:
    """Create a sample PyArrow table."""
    return pa.table({
        'id': pa.array(range(num_rows), type=pa.int64()),
        'value': pa.array([f'value_{i}' for i in range(num_rows)], type=pa.string()),
        'score': pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
    })


def setup_s3():
    """Set up the s3fs and boto3 clients."""
    # s3fs client
    fs = s3fs.S3FileSystem(
        key=S3_ACCESS_KEY,
        secret=S3_SECRET_KEY,
        client_kwargs={'endpoint_url': S3_ENDPOINT_URL},
        use_ssl=False
    )

    # boto3 client for raw S3 operations
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_ENDPOINT_URL,
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        use_ssl=False
    )

    return fs, s3_client


def test_implicit_directory_head_behavior(fs, s3_client):
    """Test that HEAD on implicit directory markers returns 404."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 1: Implicit Directory HEAD Behavior")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # Clean up any existing data
    try:
        fs.rm(test_path, recursive=True)
    except FileNotFoundError:
        pass

    # Create a dataset using PyArrow (creates an implicit directory)
    logger.info(f"Creating dataset at: {test_path}")
    table = create_sample_table(1000)
    pads.write_dataset(table, test_path, filesystem=fs, format='parquet')

    # List what was created
    logger.info("\nFiles created:")
    files = fs.ls(test_path, detail=True)
    for f in files:
        logger.info(f"  {f['name']} - size: {f['size']} bytes, type: {f['type']}")

    # Test HEAD request on the directory marker (without trailing slash)
    logger.info(f"\nTesting HEAD on: {test_path}")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='test_implicit_dir')
        logger.info(f"  HEAD response: {response['ResponseMetadata']['HTTPStatusCode']}")
        logger.info(f"  Content-Length: {response.get('ContentLength', 'N/A')}")
        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")
        logger.warning("  ⚠️ Expected 404, but got 200 - fix may not be working")
        return False
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            logger.info("  ✓ HEAD returned 404 (expected - implicit directory)")
            return True
        else:
            logger.error(f"  ✗ Unexpected error: {e}")
            return False


def test_s3fs_directory_detection(fs):
    """Test that s3fs correctly detects the directory."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 2: s3fs Directory Detection")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # Test s3fs.info()
    logger.info(f"\nTesting s3fs.info('{test_path}'):")
    try:
        info = fs.info(test_path)
        logger.info(f"  Type: {info.get('type', 'N/A')}")
        logger.info(f"  Size: {info.get('size', 'N/A')}")

        if info.get('type') == 'directory':
            logger.info("  ✓ s3fs correctly identified as directory")
            return True
        else:
            logger.warning(f"  ⚠️ s3fs identified as: {info.get('type')}")
            return False
    except Exception as e:
        logger.error(f"  ✗ Error: {e}")
        return False


def test_s3fs_isdir(fs):
    """Test that s3fs.isdir() works correctly."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 3: s3fs.isdir() Method")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nTesting s3fs.isdir('{test_path}'):")
    try:
        is_dir = fs.isdir(test_path)
        logger.info(f"  Result: {is_dir}")

        if is_dir:
            logger.info("  ✓ s3fs.isdir() correctly returned True")
            return True
        else:
            logger.warning("  ⚠️ s3fs.isdir() returned False")
            return False
    except Exception as e:
        logger.error(f"  ✗ Error: {e}")
        return False


def test_pyarrow_dataset_read(fs):
    """Test that PyArrow can read the dataset."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 4: PyArrow Dataset Read")
    logger.info("=" * 80)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nReading dataset from: {test_path}")
    try:
        ds = pads.dataset(test_path, filesystem=fs, format='parquet')
        table = ds.to_table()
        logger.info(f"  ✓ Successfully read {len(table)} rows")
        logger.info(f"  Columns: {table.column_names}")
        return True
    except Exception as e:
        logger.error(f"  ✗ Failed to read dataset: {e}")
        traceback.print_exc()
        return False


def test_explicit_directory_marker(fs, s3_client):
    """Test that explicit directory markers (with trailing slash) still work."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 5: Explicit Directory Marker (with trailing slash)")
    logger.info("=" * 80)

    # Create an explicit directory marker
    logger.info(f"\nCreating explicit directory: {BUCKET_NAME}/explicit_dir/")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key='explicit_dir/',
            Body=b'',
            ContentType='httpd/unix-directory'
        )
        logger.info("  ✓ Created explicit directory marker")
    except Exception as e:
        logger.error(f"  ✗ Failed to create: {e}")
        return False

    # Test HEAD with trailing slash
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/explicit_dir/")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='explicit_dir/')
        logger.info("  ✓ HEAD returned 200 (expected for explicit directory)")
        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")
        return True
    except ClientError as e:
        logger.error(f"  ✗ HEAD failed: {e}")
        return False


def test_empty_file_not_directory(fs, s3_client):
    """Test that legitimate empty files are not treated as directories."""
    logger.info("\n" + "=" * 80)
    logger.info("TEST 6: Empty File (not a directory)")
    logger.info("=" * 80)

    # Create an empty file with text/plain mime type
    logger.info(f"\nCreating empty file: {BUCKET_NAME}/empty.txt")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key='empty.txt',
            Body=b'',
            ContentType='text/plain'
        )
        logger.info("  ✓ Created empty file")
    except Exception as e:
        logger.error(f"  ✗ Failed to create: {e}")
        return False

    # Test HEAD
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/empty.txt")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='empty.txt')
        logger.info("  ✓ HEAD returned 200 (expected for empty file)")
        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")

        # Verify s3fs doesn't think it's a directory
        info = fs.info(f"{BUCKET_NAME}/empty.txt")
        if info.get('type') == 'file':
            logger.info("  ✓ s3fs correctly identified as file")
            return True
        else:
            logger.warning(f"  ⚠️ s3fs identified as: {info.get('type')}")
            return False
    except Exception as e:
        logger.error(f"  ✗ Error: {e}")
        return False


def main():
    """Run all tests."""
    logger.info("=" * 80)
    logger.info("Implicit Directory Fix Test Suite")
    logger.info("=" * 80)
    logger.info(f"Endpoint: {S3_ENDPOINT_URL}")
    logger.info(f"Bucket: {BUCKET_NAME}")
    logger.info("=" * 80)

    # Set up S3 clients
    fs, s3_client = setup_s3()

    # Create bucket if it doesn't exist
    try:
        s3_client.create_bucket(Bucket=BUCKET_NAME)
        logger.info(f"\n✓ Created bucket: {BUCKET_NAME}")
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code in ['BucketAlreadyOwnedByYou', 'BucketAlreadyExists']:
            logger.info(f"\n✓ Bucket already exists: {BUCKET_NAME}")
        else:
            logger.error(f"\n✗ Failed to create bucket: {e}")
            return 1

    # Run tests
    results = []

    results.append(("Implicit Directory HEAD", test_implicit_directory_head_behavior(fs, s3_client)))
    results.append(("s3fs Directory Detection", test_s3fs_directory_detection(fs)))
    results.append(("s3fs.isdir() Method", test_s3fs_isdir(fs)))
    results.append(("PyArrow Dataset Read", test_pyarrow_dataset_read(fs)))
    results.append(("Explicit Directory Marker", test_explicit_directory_marker(fs, s3_client)))
    results.append(("Empty File Not Directory", test_empty_file_not_directory(fs, s3_client)))

    # Print summary
    logger.info("\n" + "=" * 80)
    logger.info("TEST SUMMARY")
    logger.info("=" * 80)

    passed = sum(1 for _, result in results if result)
    total = len(results)

    for name, result in results:
        status = "✓ PASS" if result else "✗ FAIL"
        logger.info(f"{status}: {name}")

    logger.info("=" * 80)
    logger.info(f"Results: {passed}/{total} tests passed")
    logger.info("=" * 80)

    if passed == total:
        logger.info("\n🎉 All tests passed! The implicit directory fix is working correctly.")
        return 0
    else:
        logger.warning(f"\n⚠️ {total - passed} test(s) failed. The fix may not be fully working.")
        return 1


if __name__ == "__main__":
    sys.exit(main())
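
For context, the LIST fallback that the test's docstring refers to can be sketched in a few lines of boto3. This is an illustrative approximation of what an S3 client such as s3fs does internally when HEAD returns 404, not s3fs's actual code; `is_implicit_directory` is a hypothetical helper name:

# Illustrative sketch (not s3fs's implementation): when HEAD on a key
# returns 404, the key may still be an "implicit directory" if any
# object exists under the prefix 'key/'.
from botocore.exceptions import ClientError

def is_implicit_directory(s3_client, bucket: str, key: str) -> bool:
    """Hypothetical helper: True if `key` has no object but has children."""
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return False  # a real object exists at this key
    except ClientError as e:
        if e.response['Error']['Code'] != '404':
            raise
    # HEAD gave 404: fall back to a LIST for children under 'key/'
    resp = s3_client.list_objects_v2(
        Bucket=bucket, Prefix=key.rstrip('/') + '/', MaxKeys=1
    )
    return resp.get('KeyCount', 0) > 0

With the fix in place, test 1 exercises the 404 path on HEAD, and tests 2 and 3 confirm that s3fs's fallback then classifies the prefix as a directory.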
