diff --git a/.github/workflows/data-check.yml b/.github/workflows/data-check.yml
index 0753c5a39..aad5a4571 100644
--- a/.github/workflows/data-check.yml
+++ b/.github/workflows/data-check.yml
@@ -59,7 +59,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 pyarrow pytest toml
+          pip install basedosdados==1.6.4 pyarrow pytest toml
       - name: Set up base dos dados environment
         shell: bash
         env:
diff --git a/.github/workflows/metadata-validate.yml b/.github/workflows/metadata-validate.yml
index f134c023a..fcdf52fd6 100644
--- a/.github/workflows/metadata-validate.yml
+++ b/.github/workflows/metadata-validate.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 toml
+          pip install basedosdados==1.6.4 toml
       - name: Set up base dos dados environment
         run: python .github/workflows/env-setup/env_setup.py
         shell: bash
diff --git a/.github/workflows/table-approve.yml b/.github/workflows/table-approve.yml
index 15ca7a6c0..f2070ecf6 100644
--- a/.github/workflows/table-approve.yml
+++ b/.github/workflows/table-approve.yml
@@ -36,7 +36,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 toml
+          pip install basedosdados==1.6.4 toml
      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v0
        with:
@@ -116,7 +116,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 pyarrow pytest toml
+          pip install basedosdados==1.6.4 pyarrow pytest toml
      - name: Set up basedosdados environment
        run: |
          cd .github/workflows/env-setup
diff --git a/python-package/basedosdados/cli/cli.py b/python-package/basedosdados/cli/cli.py
index fe03cefb0..32e5bf719 100644
--- a/python-package/basedosdados/cli/cli.py
+++ b/python-package/basedosdados/cli/cli.py
@@ -280,6 +280,11 @@ def init_table(
     default=None,
     help="Location of dataset data. List of possible region names locations: https://cloud.google.com/bigquery/docs/locations",
 )
+@click.option(
+    "--chunk_size",
+    default=None,
+    help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
+)
 @click.pass_context
 def create_table(
     ctx,
@@ -295,6 +300,7 @@ def create_table(
     columns_config_url_or_path,
     dataset_is_public,
     location,
+    chunk_size,
 ):
 
     Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).create(
@@ -308,6 +314,7 @@ def create_table(
         columns_config_url_or_path=columns_config_url_or_path,
         dataset_is_public=dataset_is_public,
         location=location,
+        chunk_size=chunk_size,
     )
 
     click.echo(
@@ -428,11 +435,21 @@ def delete_table(ctx, dataset_id, table_id, mode):
     default="raise",
     help="[raise|replace|pass] if file alread exists",
 )
+@click.option(
+    "--chunk_size",
+    default=None,
+    help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
+)
 @click.pass_context
-def upload_table(ctx, dataset_id, table_id, filepath, partitions, if_exists):
+def upload_table(
+    ctx, dataset_id, table_id, filepath, partitions, if_exists, chunk_size
+):
 
     blob_name = Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).append(
-        filepath=filepath, partitions=partitions, if_exists=if_exists
+        filepath=filepath,
+        partitions=partitions,
+        if_exists=if_exists,
+        chunk_size=chunk_size,
     )
 
     click.echo(
@@ -493,12 +510,23 @@ def init_storage(ctx, bucket_name, replace, very_sure):
     default="raise",
     help="[raise|replace|pass] if file alread exists",
 )
+@click.option(
+    "--chunk_size",
+    default=None,
+    help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
+)
 @click.pass_context
-def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions, if_exists):
+def upload_storage(
+    ctx, dataset_id, table_id, filepath, mode, partitions, if_exists, chunk_size
+):
 
     ctx.obj.pop("bucket_name")
     blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
-        filepath=filepath, mode=mode, partitions=partitions, if_exists=if_exists
+        filepath=filepath,
+        mode=mode,
+        partitions=partitions,
+        if_exists=if_exists,
+        chunk_size=chunk_size,
     )
 
     click.echo(
diff --git a/python-package/basedosdados/upload/dataset.py b/python-package/basedosdados/upload/dataset.py
index d6f87b0a4..2b0648ff1 100644
--- a/python-package/basedosdados/upload/dataset.py
+++ b/python-package/basedosdados/upload/dataset.py
@@ -120,28 +120,39 @@ def publicize(self, mode="all", dataset_is_public=True):
             dataset = m["client"].get_dataset(m["id"])
             entries = dataset.access_entries
             # TODO https://github.com/basedosdados/mais/pull/1020
-            if dataset_is_public and "staging" not in dataset.dataset_id:
-                entries.extend(
-                    [
-                        bigquery.AccessEntry(
-                            role="roles/bigquery.dataViewer",
-                            entity_type="iamMember",
-                            entity_id="allUsers",
-                        ),
-                        bigquery.AccessEntry(
-                            role="roles/bigquery.metadataViewer",
-                            entity_type="iamMember",
-                            entity_id="allUsers",
-                        ),
-                        bigquery.AccessEntry(
-                            role="roles/bigquery.user",
-                            entity_type="iamMember",
-                            entity_id="allUsers",
-                        ),
-                    ]
-                )
+            # TODO if staging dataset is private, the prod view can't acess it: if dataset_is_public and "staging" not in dataset.dataset_id:
+            if dataset_is_public:
+                if "staging" not in dataset.dataset_id:
+                    entries.extend(
+                        [
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.dataViewer",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.metadataViewer",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.user",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                        ]
+                    )
+                else:
+                    entries.extend(
+                        [
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.dataViewer",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                        ]
+                    )
             dataset.access_entries = entries
-            m["client"].update_dataset(dataset, ["access_entries"])
             logger.success(
                 " {object} {object_id}_{mode} was {action}!",
diff --git a/python-package/basedosdados/upload/metadata.py b/python-package/basedosdados/upload/metadata.py
index ecd9e9a23..435405e71 100644
--- a/python-package/basedosdados/upload/metadata.py
+++ b/python-package/basedosdados/upload/metadata.py
@@ -212,14 +212,10 @@ def metadata_schema(self) -> dict:
 
         if self.table_id:
             table_url = f"{self.CKAN_URL}/api/3/action/bd_bdm_table_schema"
-            table_schema = requests.get(table_url).json().get("result")
-
-            return table_schema
+            return requests.get(table_url).json().get("result")
 
         dataset_url = f"{self.CKAN_URL}/api/3/action/bd_dataset_schema"
-        dataset_schema = requests.get(dataset_url).json().get("result")
-
-        return dataset_schema
+        return requests.get(dataset_url).json().get("result")
 
     def exists_in_ckan(self) -> bool:
         """Check if Metadata object refers to an existing CKAN package or reso
diff --git a/python-package/basedosdados/upload/storage.py b/python-package/basedosdados/upload/storage.py
index 876a728f7..e2580051d 100644
--- a/python-package/basedosdados/upload/storage.py
+++ b/python-package/basedosdados/upload/storage.py
@@ -113,6 +113,7 @@ def upload(
         mode="all",
         partitions=None,
         if_exists="raise",
+        chunk_size=None,
         **upload_args,
     ):
         """Upload to storage at `<bucket_name>/<mode>/<dataset_id>/<table_id>`. You can:
@@ -158,6 +159,10 @@ def upload(
                 * 'raise' : Raises Conflict exception
                 * 'replace' : Replace table
                 * 'pass' : Do nothing
+            chunk_size (int): Optional
+                The size of a chunk of data whenever iterating (in bytes).
+                This must be a multiple of 256 KB per the API specification.
+                If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
             upload_args ():
                 Extra arguments accepted by [`google.cloud.storage.blob.Blob.upload_from_file`](https://googleapis.dev/python/storage/latest/blobs.html?highlight=upload_from_filename#google.cloud.storage.blob.Blob.upload_from_filename)
@@ -169,7 +174,11 @@ def upload(
         path = Path(path)
 
         if path.is_dir():
-            paths = [f for f in path.glob("**/*") if f.is_file() and f.suffix == ".csv"]
+            paths = [
+                f
+                for f in path.glob("**/*")
+                if f.is_file() and f.suffix in [".csv", ".parquet", "parquet.gzip"]
+            ]
             parts = [
                 (
@@ -197,7 +206,7 @@ def upload(
 
                 blob_name = self._build_blob_name(filepath.name, m, part)
 
-                blob = self.bucket.blob(blob_name)
+                blob = self.bucket.blob(blob_name, chunk_size=chunk_size)
 
                 if not blob.exists() or if_exists == "replace":
diff --git a/python-package/basedosdados/upload/table.py b/python-package/basedosdados/upload/table.py
index 889c8429a..dd70d0fe9 100644
--- a/python-package/basedosdados/upload/table.py
+++ b/python-package/basedosdados/upload/table.py
@@ -567,6 +567,7 @@ def create(
         columns_config_url_or_path=None,
         dataset_is_public=True,
         location=None,
+        chunk_size=None,
     ):
         """Creates BigQuery table at staging dataset.
@@ -626,6 +627,10 @@ def create(
             location (str): Optional. Location of dataset data.
                 List of possible region names locations: https://cloud.google.com/bigquery/docs/locations
+            chunk_size (int): Optional
+                The size of a chunk of data whenever iterating (in bytes).
+                This must be a multiple of 256 KB per the API specification.
+                If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
         """
 
         if path is None:
@@ -651,7 +656,10 @@ def create(
         ):
 
             Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
-                path, mode="staging", if_exists=if_storage_data_exists
+                path,
+                mode="staging",
+                if_exists=if_storage_data_exists,
+                chunk_size=chunk_size,
             )
 
         # Create Dataset if it doesn't exist
@@ -835,7 +843,14 @@ def delete(self, mode):
             action="deleted",
         )
 
-    def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
+    def append(
+        self,
+        filepath,
+        partitions=None,
+        if_exists="replace",
+        chunk_size=None,
+        **upload_args,
+    ):
         """Appends new data to existing BigQuery table.
 
         As long as the data has the same schema. It appends the data in the
@@ -854,6 +869,10 @@ def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
                 * 'raise' : Raises Conflict exception
                 * 'replace' : Replace table
                 * 'pass' : Do nothing
+            chunk_size (int): Optional
+                The size of a chunk of data whenever iterating (in bytes).
+                This must be a multiple of 256 KB per the API specification.
+                If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
         """
         if not self.table_exists("staging"):
             raise BaseDosDadosException(
@@ -865,6 +884,7 @@ def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
             mode="staging",
             partitions=partitions,
             if_exists=if_exists,
+            chunk_size=chunk_size,
             **upload_args,
         )
         logger.success(
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
index 3a1d51e23..0b71875d9 100644
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -13,7 +13,7 @@ packages = [
 ]
 readme = "README.md"
 repository = "https://github.com/base-dos-dados/bases"
-version = "1.6.3-beta.2"
+version = "1.6.4"
 
 [tool.poetry.scripts]
 basedosdados = 'basedosdados.cli.cli:cli'
@@ -26,6 +26,7 @@ click = "8.0.3"
 google-cloud-bigquery = "2.30.1"
 google-cloud-bigquery-storage = "1.1.0"
 google-cloud-storage = "1.42.3"
+importlib-metadata = "^4.11.3"
 ipykernel = "5.3.4"
 jupyter = "^1.0.0"
 loguru = "^0.6.0"
@@ -44,7 +45,6 @@ python = ">=3.7.1,<3.11"
 toml = "^0.10.2"
 tomlkit = "0.7.0"
 tqdm = "4.50.2"
-importlib-metadata = "^4.11.3"
 
 [tool.black]
 # Use the more relaxed max line length permitted in PEP8.
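The same option is threaded end-to-end through `Table`, which now passes `chunk_size` down to `Storage.upload`. A sketch of how a caller might use it (again not part of the diff; the IDs, path, and the 50 MB value are placeholders, and the package-level `Table` import is assumed):

```python
import basedosdados as bd

tb = bd.Table(dataset_id="br_example_org", table_id="example_table")

# Appends the CSV to the existing staging table; 50 MiB is a multiple of
# 256 KiB, as the new help text requires.
tb.append(
    filepath="data/example_table.csv",
    if_exists="replace",
    chunk_size=50 * 1024 * 1024,
)
```

From the CLI, the same value can be passed via the new `--chunk_size` option added to the `create_table`, `upload_table`, and `upload_storage` commands above.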