Skip to content

Commit

Permalink
Merge branch 'feature/thumbcache-parser' of https://github.com/sepinf…
Browse files Browse the repository at this point in the history
…-inc/IPED into feature/thumbcache-parser

# Conflicts:
#	iped-parsers/iped-parsers-impl/src/main/java/iped/parsers/misc/ThumbcacheParser.java
  • Loading branch information
marcus6n committed Oct 18, 2024
2 parents cafee08 + d581755 commit a048449
Showing 1 changed file with 27 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
Expand All @@ -18,8 +21,6 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Set;

public class ThumbcacheParser extends AbstractParser {
Expand All @@ -32,63 +33,41 @@ public Set<MediaType> getSupportedTypes(ParseContext context) {
}

@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
new ParsingEmbeddedDocumentExtractor(context));
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();

TemporaryResources tmp = new TemporaryResources();
try (TikaInputStream tis = TikaInputStream.get(stream, tmp)) {
File file = tis.getFile();
try (POIFSFileSystem poiFS = new POIFSFileSystem(file)) {
recurseDir(poiFS.getRoot(), xhtml);
}
} finally {
xhtml.endDocument();
tmp.close();
}
TikaInputStream tis = TikaInputStream.get(stream, tmp);
File file = tis.getFile();
POIFSFileSystem poiFS = new POIFSFileSystem(file);

// Placeholder for the recursive method that will navigate through directories and process files extracting metadata and image.
// TODO: Implement recurseDir method
recurseDir(poiFS.getRoot(), extractor, xhtml);

xhtml.endDocument();
tmp.close();
}

private void recurseDir(DirectoryNode dir, XHTMLContentHandler xhtml) throws SAXException {
private void recurseDir(DirectoryNode dir, EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
for (Entry entry : dir) {
if (entry instanceof DirectoryNode) {
recurseDir((DirectoryNode) entry, xhtml);
recurseDir((DirectoryNode) entry, extractor, xhtml);
} else {
try (DocumentInputStream docStream = new DocumentInputStream((DocumentNode) entry)) {
byte[] buffer = new byte[80];
int bytesRead = docStream.read(buffer);
if (bytesRead != buffer.length) {
continue;
}

ByteBuffer bb = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN);

int size = bb.getInt();
long entryHash = bb.getLong();
int identifierStringSize = bb.getInt();
int paddingSize = bb.getInt();
int dataSize = bb.getInt();
int unknown1 = bb.getInt();
long dataChecksum = bb.getInt() & 0xFFFFFFFFL;
long headerChecksum = bb.getLong();

xhtml.startElement("pre");

xhtml.characters("size : " + size + "\n");
xhtml.characters("entry hash : 0x" + Long.toHexString(entryHash) + "\n");
xhtml.characters("identifier string size : " + identifierStringSize + "\n");
xhtml.characters("padding size : " + paddingSize + "\n");
xhtml.characters("data size : " + dataSize + "\n");
xhtml.characters("unknown1 : 0x" + Integer.toHexString(unknown1) + "\n");
xhtml.characters("data checksum : 0x" + Long.toHexString(dataChecksum) + "\n");
xhtml.characters("header checksum : 0x" + Long.toHexString(headerChecksum) + "\n");

xhtml.characters("\nidentifier string : " + Long.toHexString(entryHash) + "\n");

xhtml.endElement("pre");
Metadata entrydata = new Metadata();
entrydata.set(Metadata.CONTENT_TYPE, "thumbcache-entry");

} catch (IOException e) {
xhtml.characters("Error processing entry: " + entry.getName() + "\n");
xhtml.startElement("div", "class", "thumbcache-entry");
xhtml.element("h1", entry.getName());
xhtml.startElement("div", "class", "thumbcache-entry-content");
try (InputStream stream = new DocumentInputStream((DocumentNode) entry)) {
extractor.parseEmbedded(stream, xhtml, entrydata, true);
}
xhtml.endElement("div");
xhtml.endElement("div");
}
}
}
Expand Down

0 comments on commit a048449

Please sign in to comment.