diff --git a/Dockerfile b/Dockerfile
index 3381b25..9e62511 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@
 # tags from Docker Hub.
 FROM python:3.7-slim
 
-LABEL Name=dataset-converter Version=0.1.0
+LABEL Name=dataset-converter Version=0.1.1
 LABEL maintainer="Hugo Matalonga "
 
 ARG UID=1000
@@ -21,8 +21,6 @@ RUN addgroup --system --gid ${GID} user \
 WORKDIR /home/user
 COPY ./app/requirements.txt /home/user
 
-# Using pip:
-RUN python3 -m pip install --upgrade --no-cache-dir --compile pip
 RUN python3 -m pip install --no-cache-dir --compile -r requirements.txt
 
 ADD ./entrypoint.sh /usr/local/bin
@@ -34,4 +32,4 @@ RUN chown -R user:user /home/user
 USER user
 ENV PATH=${PATH}:/home/user/.local/bin
 
-CMD ["/usr/local/bin/entrypoint.sh"]
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/README.md b/README.md
index 5dce16e..22e213b 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,15 @@ $ docker-compose up
 ```
 
 - It will look for all `.yml`, for each dataset configured file, it will produce an optimized parquet file and a pickle file containing the pandas dtypes. The generated files are located in the `./data` folder.
+### Output files
+For each config file found, the converter keeps the same file `name` as set in the config and creates the following files:
+
+#### ${name}.dtypes.pickle
+Contains a Python dict mapping each column to its pandas dtype.
+
+#### ${name}.parquet.7z
+A parquet binary file built from the processed dataframe and compressed in 7z format.
+
 
 ## Plugins
 A plugin system is available, where is possible to call additional procedures to modify the dataset files.
diff --git a/app/app.py b/app/app.py
index d3e3d3f..750050c 100644
--- a/app/app.py
+++ b/app/app.py
@@ -72,7 +72,7 @@ def load_tasks(df, plugins, category):
 def export_files(df, name, compression):
     filename = name.split('.')[0]
 
-    filepath = os.path.join(data_path, filename + '.dtypes.p')
+    filepath = os.path.join(data_path, filename + '.dtypes.pickle')
     print('Creating dtypes file -> {}'.format(filepath))
     save_dtypes(cache_dtypes(df), filepath)
 
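
For context on the README additions above, a minimal sketch of how the two generated output files could be read back in Python. It assumes pandas (with a parquet engine such as pyarrow) and py7zr are installed, that the 7z archive contains a single `${name}.parquet` file, and that the dtypes pickle is the plain column-to-dtype dict described in the README; the dataset name and paths below are hypothetical and do not come from the repository.

```python
import pickle

import pandas as pd
import py7zr  # assumption: any 7z-capable library would work here

name = "example"      # hypothetical dataset name from a config file
data_dir = "./data"   # folder where the converter writes its output

# Restore the column -> dtype mapping from the pickle file.
with open(f"{data_dir}/{name}.dtypes.pickle", "rb") as fh:
    dtypes = pickle.load(fh)

# Extract the compressed parquet file, then load it with pandas
# and re-apply the saved dtypes.
with py7zr.SevenZipFile(f"{data_dir}/{name}.parquet.7z", mode="r") as archive:
    archive.extractall(path=data_dir)

df = pd.read_parquet(f"{data_dir}/{name}.parquet").astype(dtypes)
print(df.dtypes)
```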