Skip to content

Commit

Permalink
integrated the source code of an openZIM file format reader. These are
Browse files Browse the repository at this point in the history
the raw format reader files with no integration in YaCy yet, which will
maybe follow as a next step. The zim file format is documented in
https://openzim.org and the reader code was taken from the archived,
non-maintained repository at https://github.com/openzim/zimreader-java
  • Loading branch information
Orbiter committed Oct 27, 2023
1 parent 4308aa5 commit 1fefae9
Show file tree
Hide file tree
Showing 8 changed files with 1,021 additions and 0 deletions.
46 changes: 46 additions & 0 deletions source/org/openzim/ArticleEntry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/


package org.openzim;

/**
 * A {@link DirectoryEntry} that additionally carries a cluster number and a
 * blob number.
 */
public class ArticleEntry extends DirectoryEntry {

    /** Cluster number handed to the constructor. */
    int clusterNumber;

    /** Blob number handed to the constructor. */
    int blobnumber;

    /**
     * Creates an entry; the common values are forwarded to
     * {@link DirectoryEntry}, the cluster and blob numbers are kept here.
     *
     * @param mimeType      MIME type value, stored by the superclass
     * @param namespace     namespace character, stored by the superclass
     * @param revision      revision value, stored by the superclass
     * @param clusterNumber value returned by {@link #getClusterNumber()}
     * @param blobNumber    value returned by {@link #getBlobnumber()}
     * @param url           URL string, stored by the superclass
     * @param title         title string, stored by the superclass
     * @param urlListindex  URL list index, stored by the superclass
     */
    public ArticleEntry(
            final int mimeType,
            final char namespace,
            final int revision,
            final int clusterNumber,
            final int blobNumber,
            final String url,
            final String title,
            final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.blobnumber = blobNumber;
        this.clusterNumber = clusterNumber;
    }

    /** @return the cluster number given at construction */
    public int getClusterNumber() {
        return clusterNumber;
    }

    /** @return the blob number given at construction */
    public int getBlobnumber() {
        return blobnumber;
    }
}
69 changes: 69 additions & 0 deletions source/org/openzim/DirectoryEntry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/

package org.openzim;

/**
 * Base class for directory entries. It stores the values every entry has in
 * common (MIME type, namespace, revision, URL, title and a URL list index)
 * and exposes each of them through a getter.
 */
public abstract class DirectoryEntry {

    /** MIME type value as passed to the constructor. */
    int mimeType;

    /** Namespace character as passed to the constructor. */
    char namespace;

    /** Revision value as passed to the constructor. */
    int revision;

    /** URL string as passed to the constructor. */
    String url;

    /** Title string as passed to the constructor. */
    String title;

    /** Index value as passed to the constructor. */
    int urlListindex;

    /**
     * Stores all given values unchanged.
     *
     * @param mimeType  value returned by {@link #getMimeType()}
     * @param namespace value returned by {@link #getNamespace()}
     * @param revision  value returned by {@link #getRevision()}
     * @param url       value returned by {@link #getUrl()}
     * @param title     value returned by {@link #getTitle()}
     * @param index     value returned by {@link #getUrlListindex()}
     */
    public DirectoryEntry(
            final int mimeType,
            final char namespace,
            final int revision,
            final String url,
            final String title,
            final int index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.revision = revision;
        this.namespace = namespace;
        this.mimeType = mimeType;
    }

    /** @return the MIME type value */
    public int getMimeType() {
        return mimeType;
    }

    /** @return the namespace character */
    public char getNamespace() {
        return namespace;
    }

    /** @return the revision value */
    public int getRevision() {
        return revision;
    }

    /** @return the URL string */
    public String getUrl() {
        return url;
    }

    /** @return the title string */
    public String getTitle() {
        return title;
    }

    /** @return the URL list index */
    public int getUrlListindex() {
        return urlListindex;
    }
}
135 changes: 135 additions & 0 deletions source/org/openzim/RandomAcessFileZIMInputStream.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/

package org.openzim;

import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;

/**
 * An {@link InputStream} that wraps a {@link RandomAccessFile} and adds
 * helpers for reading little-endian integers and zero-terminated strings, as
 * needed for reading a ZIM file. Every operation works on the file pointer of
 * the wrapped file, so seeking on either object affects the other.
 *
 * @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
 */
public class RandomAcessFileZIMInputStream extends InputStream {

    /** The wrapped file; all reads and seeks are delegated to it. */
    private final RandomAccessFile mRAFReader;

    /** File position remembered by {@link #mark()}, or -1 when nothing is marked. */
    private long mMarked = -1;

    /**
     * @param reader the file to read from; it is not opened or closed by this class
     */
    public RandomAcessFileZIMInputStream(final RandomAccessFile reader) {
        this.mRAFReader = reader;
    }

    /**
     * Reads 2 bytes into {@code buffer} and decodes them with
     * {@link Utilities#toTwoLittleEndianInteger(byte[])}.
     *
     * @param buffer scratch buffer of at least 2 bytes
     * @return the decoded value
     * @throws IllegalArgumentException if {@code buffer} is shorter than 2 bytes
     * @throws IOException if the file ends early or another I/O error occurs
     */
    // TODO: Remove the parameter buffer
    public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
        fill(buffer, 2);
        return Utilities.toTwoLittleEndianInteger(buffer);
    }

    /**
     * Reads 4 bytes into {@code buffer} and decodes them with
     * {@link Utilities#toFourLittleEndianInteger(byte[])}.
     *
     * @param buffer scratch buffer of at least 4 bytes
     * @return the decoded value
     * @throws IllegalArgumentException if {@code buffer} is shorter than 4 bytes
     * @throws IOException if the file ends early or another I/O error occurs
     */
    // TODO: Remove the parameter buffer
    public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
        fill(buffer, 4);
        return Utilities.toFourLittleEndianInteger(buffer);
    }

    /**
     * Reads 8 bytes into {@code buffer} and decodes them with
     * {@link Utilities#toEightLittleEndianInteger(byte[])}. The result is an
     * {@code int}, so it cannot represent the full 64-bit range.
     *
     * @param buffer scratch buffer of at least 8 bytes
     * @return the decoded value as produced by the utility method
     * @throws IllegalArgumentException if {@code buffer} is shorter than 8 bytes
     * @throws IOException if the file ends early or another I/O error occurs
     */
    // TODO: Remove the parameter buffer
    public int readEightLittleEndianBytesValue(final byte[] buffer)
            throws IOException {
        fill(buffer, 8);
        return Utilities.toEightLittleEndianInteger(buffer);
    }

    /**
     * Reads 16 bytes into {@code buffer} and decodes them with
     * {@link Utilities#toSixteenLittleEndianInteger(byte[])}. The result is an
     * {@code int}, so it cannot represent the full 128-bit range.
     *
     * @param buffer scratch buffer of at least 16 bytes
     * @return the decoded value as produced by the utility method
     * @throws IllegalArgumentException if {@code buffer} is shorter than 16 bytes
     * @throws IOException if the file ends early or another I/O error occurs
     */
    // TODO: Remove the parameter buffer
    public int readSixteenLittleEndianBytesValue(final byte[] buffer)
            throws IOException {
        fill(buffer, 16);
        return Utilities.toSixteenLittleEndianInteger(buffer);
    }

    /**
     * Reads bytes from the current position up to (and consuming) the next
     * zero byte and returns them decoded as UTF-8. The terminator is not part
     * of the result.
     *
     * @return the decoded string, possibly empty
     * @throws java.io.EOFException if the file ends before a zero byte is found
     * @throws IOException if another I/O error occurs
     */
    public String readString() throws IOException {
        // Collect raw bytes first: a multi-byte UTF-8 sequence must be decoded
        // as a whole, not byte by byte.
        final java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
        int b;
        while ((b = this.mRAFReader.read()) != 0) {
            if (b < 0) {
                // read() keeps returning -1 at end of file; without this check
                // the loop would never terminate on a truncated file.
                throw new java.io.EOFException(
                        "end of file reached before the string terminator");
            }
            bytes.write(b);
        }
        return bytes.toString("UTF-8");
    }

    /** Reads one byte from the wrapped file, or returns -1 at end of file. */
    @Override
    public int read() throws IOException {
        return this.mRAFReader.read();
    }

    /** @return the wrapped file */
    public RandomAccessFile getRandomAccessFile() {
        return this.mRAFReader;
    }

    /** Moves the file pointer of the wrapped file to {@code pos}. */
    public void seek(final long pos) throws IOException {
        this.mRAFReader.seek(pos);
    }

    /** @return the current file pointer of the wrapped file */
    public long getFilePointer() throws IOException {
        return this.mRAFReader.getFilePointer();
    }

    /** Remembers the current file position for a later {@link #reset()}. */
    public void mark() throws IOException {
        this.mMarked = this.mRAFReader.getFilePointer();
    }

    /**
     * Seeks back to the position remembered by {@link #mark()} and forgets it.
     * Does nothing when no position is remembered.
     */
    @Override
    public void reset() throws IOException {
        if (this.mMarked == -1) {
            return;
        }
        this.mRAFReader.seek(this.mMarked);
        this.mMarked = -1;
    }

    /**
     * Validates the buffer size and fills the first {@code count} bytes of
     * {@code buffer} from the file. Uses readFully because a plain read may
     * legally return fewer bytes than requested.
     */
    private void fill(final byte[] buffer, final int count) throws IOException {
        if (buffer.length < count) {
            throw new IllegalArgumentException(
                    "buffer too small: need " + count + " bytes, got " + buffer.length);
        }
        this.mRAFReader.readFully(buffer, 0, count);
    }
}
37 changes: 37 additions & 0 deletions source/org/openzim/RedirectEntry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/

package org.openzim;

/**
 * A {@link DirectoryEntry} that additionally carries a redirect index.
 */
public class RedirectEntry extends DirectoryEntry {

    /** Redirect index handed to the constructor. */
    int redirectIndex;

    /**
     * Creates an entry; the common values are forwarded to
     * {@link DirectoryEntry}, the redirect index is kept here.
     *
     * @param mimeType      MIME type value, stored by the superclass
     * @param namespace     namespace character, stored by the superclass
     * @param revision      revision value, stored by the superclass
     * @param redirectIndex value returned by {@link #getRedirectIndex()}
     * @param url           URL string, stored by the superclass
     * @param title         title string, stored by the superclass
     * @param urlListindex  URL list index, stored by the superclass
     */
    public RedirectEntry(
            final int mimeType,
            final char namespace,
            final int revision,
            final int redirectIndex,
            final String url,
            final String title,
            final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.redirectIndex = redirectIndex;
    }

    /** @return the redirect index given at construction */
    public int getRedirectIndex() {
        return redirectIndex;
    }
}
84 changes: 84 additions & 0 deletions source/org/openzim/Utilities.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/


package org.openzim;

import java.io.IOException;
import java.io.InputStream;

/**
 * Static helpers for decoding little-endian integers from byte arrays and for
 * skipping bytes in a stream.
 */
public class Utilities {

    // TODO: Write a binary search algorithm
    /** Placeholder; always returns -1. */
    public static int binarySearch() {
        return -1;
    }

    /**
     * Decodes the first 2 bytes of {@code buffer} as an unsigned little-endian
     * value.
     *
     * @param buffer at least 2 bytes
     * @return a value in the range 0..65535
     * @throws IllegalArgumentException if {@code buffer} is shorter than 2 bytes
     */
    public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 2);
        return (buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8);
    }

    /**
     * Decodes the first 4 bytes of {@code buffer} as a little-endian
     * {@code int}. Values with the top bit set come back negative.
     *
     * @param buffer at least 4 bytes
     * @return the decoded value
     * @throws IllegalArgumentException if {@code buffer} is shorter than 4 bytes
     */
    public static int toFourLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 4);
        return (buffer[0] & 0xFF)
                | ((buffer[1] & 0xFF) << 8)
                | ((buffer[2] & 0xFF) << 16)
                | ((buffer[3] & 0xFF) << 24);
    }

    /**
     * Decodes the first 8 bytes of {@code buffer} as a little-endian value and
     * returns its low 32 bits. Use {@link #toEightLittleEndianLong(byte[])}
     * when the full 64-bit value is needed.
     *
     * @param buffer at least 8 bytes
     * @return the low 32 bits of the decoded value
     * @throws IllegalArgumentException if {@code buffer} is shorter than 8 bytes
     */
    public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
        // Shifting an int by 32 or more wraps the shift distance, which would
        // OR the upper bytes onto the lower ones; decode as long and narrow.
        return (int) toEightLittleEndianLong(buffer);
    }

    /**
     * Decodes the first 8 bytes of {@code buffer} as a little-endian
     * {@code long}.
     *
     * @param buffer at least 8 bytes
     * @return the decoded 64-bit value
     * @throws IllegalArgumentException if {@code buffer} is shorter than 8 bytes
     */
    public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
        requireLength(buffer, 8);
        long result = 0L;
        // Most significant byte is last in the array, so fold from the end.
        for (int i = 7; i >= 0; i--) {
            result = (result << 8) | (buffer[i] & 0xFFL);
        }
        return result;
    }

    /**
     * Decodes the first 16 bytes of {@code buffer} as a little-endian value
     * and returns its low 32 bits; a 128-bit value does not fit any Java
     * primitive, so the remaining bytes are only length-checked.
     *
     * @param buffer at least 16 bytes
     * @return the low 32 bits of the decoded value
     * @throws IllegalArgumentException if {@code buffer} is shorter than 16 bytes
     */
    public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 16);
        return toFourLittleEndianInteger(buffer);
    }

    /**
     * Skips exactly {@code bytes} bytes of {@code stream}.
     *
     * @param stream the stream to advance
     * @param bytes  number of bytes to skip; a non-positive value is passed to
     *               {@link InputStream#skip(long)} once and nothing more is done
     * @throws java.io.EOFException if the stream ends before {@code bytes}
     *                              bytes were skipped
     * @throws IOException if the stream reports another I/O error
     */
    public static void skipFully(final InputStream stream, final long bytes) throws IOException {
        if (bytes <= 0) {
            stream.skip(bytes);
            return;
        }
        long remaining = bytes;
        while (remaining > 0) {
            final long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (stream.read() < 0) {
                // skip() may return 0 without being at the end; only a failed
                // read proves the stream is exhausted.
                throw new java.io.EOFException(
                        "stream ended with " + remaining + " of " + bytes + " bytes left to skip");
            } else {
                remaining--;
            }
        }
    }

    /** Rejects buffers that hold fewer than {@code needed} bytes. */
    private static void requireLength(final byte[] buffer, final int needed) {
        if (buffer.length < needed) {
            throw new IllegalArgumentException(
                    "buffer too small: need " + needed + " bytes, got " + buffer.length);
        }
    }

}
Loading

0 comments on commit 1fefae9

Please sign in to comment.