Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate CSV from parquet files #19

Open
alexgarel opened this issue Dec 17, 2024 · 2 comments
Open

Generate CSV from parquet files #19

alexgarel opened this issue Dec 17, 2024 · 2 comments
Assignees

Comments

@alexgarel
Copy link
Member

We want to avoid doing all the work to generate the CSV on the Product Opener instance.

@alexgarel
Copy link
Member Author

@jeremyarancio I know you are working on this but I created the issue :-)

@jeremyarancio
Copy link
Collaborator

jeremyarancio commented Dec 17, 2024

Conversion is going!

I just tackled the image_url fields in the CSV, and I noticed that "rev" was missing from the Parquet dataset.
Here are the URLs in the CSV:

code = 4061461479824
image_url = https://images.openfoodfacts.org/images/products/406/146/147/9824/front_de.37.400.jpg
image_small_url = https://images.openfoodfacts.org/images/products/406/146/147/9824/front_de.37.200.jpg
image_ingredients_url = https://images.openfoodfacts.org/images/products/406/146/147/9824/ingredients_de.9.400.jpg
image_ingredients_small_url = https://images.openfoodfacts.org/images/products/406/146/147/9824/ingredients_de.9.200.jpg
image_nutrition_url = https://images.openfoodfacts.org/images/products/406/146/147/9824/nutrition_de.36.400.jpg
image_nutrition_small_url = https://images.openfoodfacts.org/images/products/406/146/147/9824/nutrition_de.36.200.jpg

The rev we're interested in is hidden in the images field of the JSONL file, but is missing from the Parquet.
Note that it doesn't correspond to the top-level rev field (equal to 40 in this example): each selected image carries its own rev (e.g. "37" for front_de, which matches front_de.37.400.jpg).

A PR to the Parquet export will be added to fix it.

For more info about image urls, check the doc

From the JSONL
select rev, images from jsonl where code = '4061461479824';

rev = 40
images = {
    "1": {
        "uploader": "kiliweb",
        "uploaded_t": 1628238343,
        "sizes": {
            "100": {"w": 50, "h": 100},
            "full": {"h": 1200, "w": 601},
            "400": {"h": 400, "w": 200},
            "200": null
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": null,
        "imgid": null,
        "y1": null,
        "x2": null,
        "geometry": null,
        "y2": null,
        "white_magic": null,
        "rev": null,
        "ocr": null,
        "orientation": null
    },
    "packaging_de": {
        "uploader": null,
        "uploaded_t": null,
        "sizes": {
            "100": {"w": 100, "h": 71},
            "full": {"h": 848, "w": 1190},
            "400": {"h": 285, "w": 400},
            "200": {"w": 200, "h": 143}
        },
        "x1": "-1",
        "angle": 0,
        "normalize": null,
        "coordinates_image_size": "full",
        "imgid": "4",
        "y1": "-1",
        "x2": "-1",
        "geometry": "0x0--1--1",
        "y2": "-1",
        "white_magic": null,
        "rev": "12",
        "ocr": null,
        "orientation": null
    },
    "front_de": {
        "uploader": null,
        "uploaded_t": null,
        "sizes": {
            "100": {"w": 50, "h": 100},
            "full": {"h": 1200, "w": 601},
            "400": {"h": 400, "w": 200},
            "200": {"w": 100, "h": 200}
        },
        "x1": "0",
        "angle": "0",
        "normalize": null,
        "coordinates_image_size": "full",
        "imgid": "1",
        "y1": "0",
        "x2": "601",
        "geometry": "601x1200-0-0",
        "y2": "1200",
        "white_magic": null,
        "rev": "37",
        "ocr": null,
        "orientation": null
    },
    "2": {
        "uploader": "kiliweb",
        "uploaded_t": 1628238343,
        "sizes": {
            "100": {"w": 70, "h": 100},
            "full": {"h": 1200, "w": 839},
            "400": {"h": 400, "w": 280},
            "200": null
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": null,
        "imgid": null,
        "y1": null,
        "x2": null,
        "geometry": null,
        "y2": null,
        "white_magic": null,
        "rev": null,
        "ocr": null,
        "orientation": null
    },
    "ingredients_de": {
        "uploader": null,
        "uploaded_t": null,
        "sizes": {
            "100": {"w": 100, "h": 18},
            "full": {"h": 536, "w": 3024},
            "400": {"h": 71, "w": 400},
            "200": {"w": 200, "h": 35}
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": "400",
        "imgid": "3",
        "y1": null,
        "x2": null,
        "geometry": "0x0-0-0",
        "y2": null,
        "white_magic": null,
        "rev": "9",
        "ocr": null,
        "orientation": null
    },
    "3": {
        "uploader": "gehrmaja",
        "uploaded_t": 1628588851,
        "sizes": {
            "100": {"w": 100, "h": 18},
            "full": {"h": 536, "w": 3024},
            "400": {"h": 71, "w": 400},
            "200": null
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": null,
        "imgid": null,
        "y1": null,
        "x2": null,
        "geometry": null,
        "y2": null,
        "white_magic": null,
        "rev": null,
        "ocr": null,
        "orientation": null
    },
    "6": {
        "uploader": "femmenoire",
        "uploaded_t": 1701512789,
        "sizes": {
            "100": {"w": 100, "h": 88},
            "full": {"h": 1117, "w": 1274},
            "400": {"h": 351, "w": 400},
            "200": null
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": null,
        "imgid": null,
        "y1": null,
        "x2": null,
        "geometry": null,
        "y2": null,
        "white_magic": null,
        "rev": null,
        "ocr": null,
        "orientation": null
    },
    "5": {
        "uploader": "prepperapp",
        "uploaded_t": 1690095501,
        "sizes": {
            "100": {"w": 100, "h": 100},
            "full": {"h": 800, "w": 800},
            "400": {"h": 400, "w": 400},
            "200": null
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": null,
        "imgid": null,
        "y1": null,
        "x2": null,
        "geometry": null,
        "y2": null,
        "white_magic": null,
        "rev": null,
        "ocr": null,
        "orientation": null
    },
    "4": {
        "uploader": "gehrmaja",
        "uploaded_t": 1628588949,
        "sizes": {
            "100": {"w": 100, "h": 71},
            "full": {"h": 848, "w": 1190},
            "400": {"h": 285, "w": 400},
            "200": null
        },
        "x1": null,
        "angle": null,
        "normalize": null,
        "coordinates_image_size": null,
        "imgid": null,
        "y1": null,
        "x2": null,
        "geometry": null,
        "y2": null,
        "white_magic": null,
        "rev": null,
        "ocr": null,
        "orientation": null
    },
    "nutrition_de": {
        "uploader": null,
        "uploaded_t": null,
        "sizes": {
            "100": {"w": 100, "h": 88},
            "full": {"h": 1117, "w": 1274},
            "400": {"h": 351, "w": 400},
            "200": {"w": 200, "h": 175}
        },
        "x1": "-1",
        "angle": 0,
        "normalize": null,
        "coordinates_image_size": "full",
        "imgid": "6",
        "y1": "-1",
        "x2": "-1",
        "geometry": "0x0--1--1",
        "y2": "-1",
        "white_magic": null,
        "rev": "36",
        "ocr": null,
        "orientation": null
    }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants