
Commit cb0e5f6

Committed Mar 4, 2022

✅ Update : version 0.0.3

1 parent cb175a1 commit cb0e5f6

5 files changed: +71 −7 lines changed
ParquetLoader/__init__.py (+2 −2)

@@ -1,10 +1,10 @@
 __name__ = 'parquet-loader'
 __description__ = 'Parquet file Load and Read from minio & S3'
-__version__ = '0.0.2'
+__version__ = '0.0.3'
 __url__ = 'https://github.com/Keunyoung-Jung/ParquetLoader'
 __download_url__ = 'https://github.com/Keunyoung-Jung/ParquetLoader'
 __install_requires__ = [
-    "pandas==1.2.0",
+    "pandas<=1.2.0",
     "fastparquet==0.8.0",
     "s3fs"
     ],

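Two small changes here: the version string is bumped to 0.0.3, and the pandas requirement is loosened from the exact pin `==1.2.0` to `<=1.2.0`, so any pandas release up to and including 1.2.0 now satisfies it. For a quick sanity check, the version is exposed as a plain module attribute; a minimal sketch, assuming the package imports as `ParquetLoader` the same way `tests/minio.py` in this commit does:

```python
import ParquetLoader

# Prints '0.0.3' once this commit is installed.
print(ParquetLoader.__version__)
```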
ParquetLoader/loader.py (+2 −1)

@@ -39,8 +39,9 @@ def __init__(self,
         try :
             self.initialize()
         except IndexError as e :
-            print(e)
+            print("IndexError :",e)
             print(f'"{self.root_path}" may be incorrect or it may be an empty folder.')
+            exit()

     def initialize(self) :
         path = f'{self.root_path}/{self.folder}{"/*"*self.depth}/*.parquet'

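This change only affects how a bad root path is reported: instead of printing the raw IndexError and carrying on, the constructor now prints a labelled error plus a hint and exits. A hedged illustration of the call-site behaviour; it assumes `DataLoader` is importable from the package the same way `S3Loader` is, and the folder name is a placeholder:

```python
from ParquetLoader import DataLoader  # assumed export, mirroring S3Loader

# Pointing the loader at a folder with no parquet files now terminates with a hint
# instead of continuing in a half-initialized state. Expected output (illustrative):
#   IndexError : list index out of range
#   "." may be incorrect or it may be an empty folder.
dl = DataLoader(folder='no-such-folder')
```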
ParquetLoader/s3.py (+33 −3)

@@ -7,6 +7,9 @@
 class S3Loader(DataLoader) :
     def __init__(self,
                  chunk_size : int =100000,
+                 s3_endpoint_url: str = '',
+                 s3_access_key: str ='',
+                 s3_secret_key: str ='',
                  bucket : str = '.',
                  folder : str = 'data',
                  shuffle : bool = True,
@@ -15,6 +18,13 @@ def __init__(self,
                  depth : int = 0,
                  std_out: bool = True
                  ):
+        self.client_dict = None
+        if s3_endpoint_url != '' or \
+            s3_access_key != '' or \
+            s3_secret_key != '' :
+            self.client_dict = self.create_s3_client_dict(s3_endpoint_url,
+                                                          s3_access_key,
+                                                          s3_secret_key)
         super().__init__(
             chunk_size=chunk_size,
             root_path=bucket,
@@ -29,8 +39,12 @@ def __init__(self,
     def initialize(self) :
         path = f'{self.root_path}/{self.folder}{"/*"*self.depth}/*.parquet'

-        s3 = s3fs.S3FileSystem()
-        fs = s3fs.core.S3FileSystem()
+        if self.client_dict != None :
+            s3 = s3fs.S3FileSystem(client_kwargs=self.client_dict)
+            fs = s3fs.core.S3FileSystem(client_kwargs=self.client_dict)
+        else :
+            s3 = s3fs.S3FileSystem()
+            fs = s3fs.core.S3FileSystem()

         all_paths = fs.glob(path=path)
         if self.std_out :
@@ -49,4 +63,20 @@ def initialize(self) :
         if self.std_out :
             print(f'{len(all_paths)}files Initialize ... complete {round(time()-start_init_time,2)}sec')
         self.total_num = self.fp_obj.info['row_groups']
-        self.shuffle_seed_list = random.Random(self.random_seed).sample(range(100000+self.total_num*10),self.total_num)
+        self.shuffle_seed_list = random.Random(self.random_seed).sample(range(100000+self.total_num*10),self.total_num)
+
+    def create_s3_client_dict(self,
+                              s3_endpoint_url,
+                              s3_access_key,
+                              s3_secret_key
+                              ):
+        import os
+        client_dict = {}
+        if s3_endpoint_url != '' :
+            client_dict['endpoint_url'] = s3_endpoint_url
+        if s3_access_key != '' :
+            os.environ['AWS_ACCESS_KEY_ID'] = s3_access_key
+        if s3_secret_key != '' :
+            os.environ['AWS_SECRET_ACCESS_KEY'] = s3_secret_key
+
+        return client_dict

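Put differently, the new `create_s3_client_dict` helper splits the configuration in two: the endpoint travels through s3fs's `client_kwargs`, while the access and secret keys are exported as the standard AWS environment variables that s3fs (via botocore) reads automatically. A minimal standalone sketch of the same wiring, with a placeholder MinIO endpoint and credentials:

```python
import os
import s3fs

# Placeholder credentials; in practice these come from your MinIO/S3 account.
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

# client_kwargs is forwarded to the underlying boto client, so endpoint_url
# redirects s3fs from AWS to an S3-compatible service such as MinIO.
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://localhost:9000'})

# The loader's initialize() runs essentially this kind of glob over the bucket.
print(fs.glob('test/data/*.parquet'))
```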
README.md (+19 −1)

@@ -130,4 +130,22 @@ print(data_loader.schema) # get data schema
 print(data_loader.columns) # get data columns
 print(data_loader.count) # get total count
 print(data_loader.info) # get metadata infomation
-```
+```
+
+## 6. Customize S3 Path
+If you use MinIO or another object storage, you can use the S3 parameters
+```python
+dl = S3Loader(
+    s3_endpoint_url : str = '',
+    s3_access_key : str = '',
+    s3_secret_key : str = '',
+    bucket : str = '.',
+    folder : str = 'data',
+)
+```
+* `s3_endpoint_url`
+    * Storage endpoint url you want to use
+    * example : "http://mino-service.kubeflow:9000"
+* `s3_access_key` and `s3_secret_key`
+    * you can set s3_access_key and s3_secret_key here, but I don't recommend it
+    * it is recommended to use environment variables instead.

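In line with that recommendation, a hedged usage sketch that passes only the endpoint to `S3Loader` and leaves the credentials to environment variables (bucket and folder names are placeholders):

```python
import os
from ParquetLoader import S3Loader

# Keep secrets out of code: s3fs picks these up automatically.
os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key'        # placeholder
os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-key'    # placeholder

dl = S3Loader(
    s3_endpoint_url='http://mino-service.kubeflow:9000',   # endpoint from the README example
    bucket='my-bucket',                                     # placeholder bucket
    folder='data')

for df in dl:          # iterating yields pandas DataFrames, one chunk at a time
    print(df.head())
```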
tests/minio.py (+15)

@@ -0,0 +1,15 @@
+from ParquetLoader import S3Loader
+
+def s3_data_loading():
+    sl = S3Loader(
+        s3_endpoint_url='http://localhost:9000',
+        s3_access_key='minio',
+        s3_secret_key='minio123',
+        bucket='test',
+        folder="test-data")
+    print(sl.schema)
+    for df in sl :
+        print(df.head())
+
+if __name__ == '__main__':
+    s3_data_loading()

0 commit comments
