Skip to content

Elasticsearch

Vishal edited this page May 1, 2021 · 19 revisions

We run an Elasticsearch index in a Docker container. The following mapping is the schema we use; right now the mapping is dynamic.

{
  "assess_remap": {
    "mappings": {
      "properties": {
        "category": {
          "properties": {
            "ics": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            }
          }
        },
        "description": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "doc_number": {
          "type": "long"
        },
        "hash": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "id": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "ingestion_date": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "published_date": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "raw_id": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "sdo": {
          "properties": {
            "abbreviation": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "data": {
              "properties": {
                "code": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "edition": {
                  "type": "long"
                },
                "field": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "group": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "number_of_pages": {
                  "type": "long"
                },
                "preview_url": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "section_titles": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "sections": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "subgroup": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                },
                "type": {
                  "type": "text",
                  "fields": {
                    "keyword": {
                      "type": "keyword",
                      "ignore_above": 256
                    }
                  }
                }
              }
            }
          }
        },
        "status": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "technical_committee": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "text": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "title": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "url": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        }
      }
    }
  }
}

Here is a dummy example:

{
        "id": "x0288b9ed144439f8ad8fa017d604eac",
        "raw_id": "ISO 44-2:2015",
        "description": "ISO 000 is a dummy standard I am adding that is made up.",
        "ingestion_date": "2018-03-10 13:07:45",
        "hash": "7c8dc19cfbb38a573090c4b0b2c6d3b4f4d68f98ed55506aed936f78cfc71590",
        "published_date": "2020-12",
        "isbn": null,
        "text": [
            "description"
        ],
        "status": "TO_DELETE",
        "technical_committee": "ISO/TC 1111",
        "title": "This is dummy data",
        "url": "https://www.iso.org/standard/123123.html",
        "category": {
            "ics": "['43.060.20']"
        },
        "sdo": {
            "abbreviation": "iso",
            "data": {
                "code": "0.30.010.10.ISO/IEC 10592:1992",
                "edition": [
                    2
                ],
                "field": "1",
                "group": "1.1",
                "number_of_pages": [
                    1
                ],
                "preview_url": "https://www.iso.org/obp/ui/#!iso:std:123123:en",
                "section_titles": [
                    "Foreword",
                    "Introduction",
                    "1   Scope",
                    "2   Normative references",
                    "3   Terms and definitions"
                ],
                "sections": null,
                "subgroup": "1.1.1",
                "type": "standard"
            }
        }
}

Glossary

  • "id": A UUID generated by us that serves as a unique primary key.
  • "raw_id": The raw ID exactly as it was scraped.
  • "doc_number": The generated document number referring to the order in which the standard was ingested.
  • "title": Title of the standard.
  • "published_date": Date published; the value is scraped and kept in its raw form.
  • "ingestion_date": Date of ingestion in Elasticsearch date format, yyyy-MM-dd HH:mm:ss (e.g. 2018-03-10 13:07:45).
  • "hash": SHA256 hash of the whole file (used to quickly check whether the file has changed). These take time to compute and are dependent on whether we can reach the whole document.
  • "description": Description of standard.
  • "isbn": (International Standard Book Number) [this is unique for each publication or revision etc].
  • "status": The status embedded in the standard (active, withdrawn, etc.).
  • "technical_committee": The technical committee embedded in the standard.
  • "text": Tells us which text fields to use for the similarity analysis.
  • "url": URL location of the standard.
  • "category": Designated by the standard metadata, a dictionary of the category.
  • "sdo": SDO (Standard Developing Organization), a.k.a. SSO (Standard Setting Organization). This is a dictionary of SDO-specific terms under "data", as well as a key called "abbreviation".
Clone this wiki locally