-
Notifications
You must be signed in to change notification settings - Fork 31
Metadata
Matthew Caruana Galizia edited this page Jan 7, 2017
·
3 revisions
Unless you disable the --outputMetadata
option when using the spew
command, Extract adds all the metadata fields extracted by Tika from each file parsed.
When outputting to Solr, a few extra fields are added that make working with the index easier. These are as follows.
-
extract_id
: a unique ID for the document, a hash digest of the file by default. -
extract_base_type
: the Content-Type
without any parameters. Useful for file type based faceting. -
extract_paths
: the original file path. This field is multivalued when using a file hash digest as the method for calculating IDs. -
extract_parent_paths
: the file's parent path. Useful for drill-down faceting when combined with Solr's PathHierarchyTokenizerFactory
. While faceting on the extract_paths
field is technically possible, it's not desirable because you'll always get a facet for the file itself, with a document count of one.
When outputting to Solr, all metadata field names are converted to lowercase, prefixed with tika_
, and non-alphanumeric characters are converted to underscores.
The following is an example of how to define the metadata fields in your schema, using path tokenisation on the extract_parent_paths
and extract_paths
fields. You will then be able to use drill-down faceting on the extract_parent_paths
field in a client UI to navigate a hierarchy and view results per-directory.
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example Solr schema for an index populated by Extract. -->
<schema name="test" version="1.5">
  <types>
    <!-- Untokenised string, used for IDs, content types and raw metadata values. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true" />
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0" />

    <!-- General-purpose full-text type: tokenise, lowercase, drop stop words, stem. -->
    <fieldType name="text" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.StandardFilterFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.StopFilterFactory" />
        <filter class="solr.PorterStemFilterFactory" />
      </analyzer>
    </fieldType>

    <!-- Path type for drill-down faceting: the indexed value is tokenised on "/" by the path hierarchy tokenizer, while the query string is kept whole as a single token. -->
    <fieldType name="descendant_path" class="solr.TextField">
      <analyzer type="index">
        <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.KeywordTokenizerFactory" />
      </analyzer>
    </fieldType>
  </types>

  <fields>
    <field name="_version_" type="long" indexed="true" stored="true" />

    <!-- Fields added by Extract when outputting to Solr. -->
    <field name="extract_id" type="string" indexed="true" stored="true" multiValued="false" required="true" />
    <field name="extract_base_type" type="string" indexed="true" stored="true" />
    <field name="extract_paths" type="descendant_path" indexed="true" stored="true" multiValued="true" />
    <field name="extract_parent_paths" type="descendant_path" indexed="true" stored="true" multiValued="true" />

    <!-- Search-only copy of the document body; populated by the copyField at the end of this schema. -->
    <field name="text" type="text" indexed="true" stored="false" />

    <!-- Main body of document. NOTE: This field is not indexed by default, since it is also copied to "text" using copyField below. This is to save space. Use this field for returning and highlighting document content. Use the "text" field to search the content. -->
    <field name="tika_content" type="text" indexed="false" stored="true" />

    <!-- Dynamic fields for arbitrary metadata. Metadata field names are prefixed with "tika_" when outputting to Solr, so the pattern must match that prefix. -->
    <dynamicField name="tika_*" type="string" indexed="true" stored="true" />
    <!-- NOTE(review): older pattern, retained so that anything already relying on a "metadata_" prefix keeps working; remove once confirmed unused. -->
    <dynamicField name="metadata_*" type="string" indexed="true" stored="true" />
  </fields>

  <!-- Text field to search by default. -->
  <defaultSearchField>text</defaultSearchField>

  <!-- Field to use to determine and enforce document uniqueness. -->
  <uniqueKey>extract_id</uniqueKey>

  <!-- Copy the stored document body into the indexed "text" field so that it is searchable. -->
  <copyField source="tika_content" dest="text" />
</schema>