diff --git a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/Cardinal.java b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/Cardinal.java index 226407d4..6dca9301 100644 --- a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/Cardinal.java +++ b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/Cardinal.java @@ -24,7 +24,7 @@ import org.apache.jena.sparql.expr.NodeValue; import org.apache.jena.sparql.function.FunctionBase1; import org.apache.jena.sparql.util.FmtUtils; -import org.semarglproject.vocab.XSD; +import io.github.sparqlanything.html.org.semarglproject.vocab.XSD; public class Cardinal extends FunctionBase1 implements FXFunction { diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NQuadsParser.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NQuadsParser.java new file mode 100644 index 00000000..7a96542d --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NQuadsParser.java @@ -0,0 +1,407 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.sparqlanything.html.org.semarglproject.rdf; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler; +import io.github.sparqlanything.html.org.semarglproject.sink.CharSink; +import io.github.sparqlanything.html.org.semarglproject.sink.Pipe; +import io.github.sparqlanything.html.org.semarglproject.sink.QuadSink; +import io.github.sparqlanything.html.org.semarglproject.source.StreamProcessor; + +import java.util.BitSet; + +/** + * Implementation of streaming NQuads parser. + *
+ * List of supported options: + * + */ +public final class NQuadsParser extends Pipe implements CharSink { + + /** + * Class URI for errors produced by a parser + */ + public static final String ERROR = "http://semarglproject.org/nquads/Error"; + + private static final short PARSING_OUTSIDE = 0; + private static final short PARSING_URI = 1; + private static final short PARSING_BNODE = 2; + private static final short PARSING_LITERAL = 3; + private static final short PARSING_AFTER_LITERAL = 4; + private static final short PARSING_LITERAL_TYPE = 5; + private static final short PARSING_COMMENT = 6; + + private static final short OBJECT_NON_LITERAL = 0; + private static final short OBJECT_PLAIN_LITERAL = 1; + private static final short OBJECT_TYPED_LITERAL = 2; + + private static final char SENTENCE_END = '.'; + + /** + * NQuads whitespace char checker + */ + private static final BitSet WHITESPACE = new BitSet(); + + static { + WHITESPACE.set('\t'); + WHITESPACE.set(' '); + WHITESPACE.set('\r'); + WHITESPACE.set('\n'); + } + + + private String subj = null; + private String pred = null; + private String literal = null; + private String literalType = null; // type or lang for non-plain literals + private byte quadType = -1; + + private io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler processorGraphHandler = null; + private boolean ignoreErrors = false; + private boolean skipSentence = false; + + private short parsingState; + + private int tokenStartPos; + private short charsToEscape = 0; + private boolean waitingForSentenceEnd = false; + private StringBuilder addBuffer = null; + + private NQuadsParser(QuadSink sink) { + super(sink); + } + + /** + * Creates instance of NQuadsParser connected to specified sink. 
+ * @param sink sink to be connected to + * @return instance of NQuadsParser + */ + public static CharSink connect(QuadSink sink) { + return new NQuadsParser(sink); + } + + private void error(String msg) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (processorGraphHandler != null) { + processorGraphHandler.error(ERROR, msg); + } + if (!ignoreErrors) { + throw new io.github.sparqlanything.html.org.semarglproject.rdf.ParseException(msg); + } else { + resetQuad(); + skipSentence = true; + parsingState = PARSING_OUTSIDE; + } + } + + @Override + public NQuadsParser process(String str) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + return process(str.toCharArray(), 0, str.length()); + } + + @Override + public NQuadsParser process(char ch) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + char[] buffer = new char[1]; + buffer[0] = ch; + return process(buffer, 0, 1); + } + + @Override + public NQuadsParser process(char[] buffer, int start, int count) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (tokenStartPos != -1) { + tokenStartPos = start; + } + int end = start + count; + + for (int pos = start; pos < end; pos++) { + if (skipSentence && buffer[pos] != SENTENCE_END) { + continue; + } else { + skipSentence = false; + } + + if (parsingState == PARSING_OUTSIDE) { + processOutsideChar(buffer, pos); + } else if (parsingState == PARSING_COMMENT) { + if (buffer[pos] == '\n' || buffer[pos] == '\r') { + parsingState = PARSING_OUTSIDE; + } + } else if (parsingState == PARSING_URI) { + if (buffer[pos] == '>') { + onNonLiteral(unescape(extractToken(buffer, pos, 1))); + parsingState = PARSING_OUTSIDE; + } + } else if (parsingState == PARSING_BNODE) { + if (WHITESPACE.get(buffer[pos]) || buffer[pos] == SENTENCE_END) { + onNonLiteral(extractToken(buffer, pos - 1, 0)); + parsingState = PARSING_OUTSIDE; + } + } else if (parsingState == PARSING_LITERAL) { + 
processLiteralChar(buffer, pos); + } else if (parsingState == PARSING_AFTER_LITERAL) { + if (buffer[pos] == '@' || buffer[pos] == '^') { + tokenStartPos = pos; + parsingState = PARSING_LITERAL_TYPE; + } else if (WHITESPACE.get(buffer[pos]) || buffer[pos] == '<') { + onPlainLiteral(literal, null); + parsingState = PARSING_OUTSIDE; + processOutsideChar(buffer, pos); + } else { + error("Unexpected character '" + buffer[pos] + "' after literal in string '" + new String(buffer) + "'"); + } + } else if (parsingState == PARSING_LITERAL_TYPE) { + processLiteralTypeChar(buffer, pos); + } + } + if (tokenStartPos != -1) { + if (addBuffer == null) { + addBuffer = new StringBuilder(); + } + addBuffer.append(buffer, tokenStartPos, end - tokenStartPos); + } + return this; + } + + private void processLiteralChar(char[] buffer, int pos) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (charsToEscape == 9 && buffer[pos] == 'u') { + charsToEscape -= 5; + } else if (charsToEscape == 9 && buffer[pos] != 'U') { + charsToEscape = 0; + } else if (charsToEscape > 0) { + charsToEscape--; + } else { + if (buffer[pos] == '\"') { + literal = unescape(extractToken(buffer, pos, 1)); + parsingState = PARSING_AFTER_LITERAL; + } else if (buffer[pos] == '\\') { + charsToEscape = 9; + } + } + } + + private void processLiteralTypeChar(char[] buffer, int pos) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (WHITESPACE.get(buffer[pos])) { + String type = extractToken(buffer, pos, 0); + int trimSize = type.charAt(type.length() - 1) == SENTENCE_END ? 
1 : 0; + if (type.charAt(0) == '@') { + onPlainLiteral(literal, type.substring(1, type.length() - 1 - trimSize)); + } else if (type.startsWith("^^<") && type.charAt(type.length() - 2) == '>') { + onTypedLiteral(literal, type.substring(3, type.length() - 2 - trimSize)); + } else { + error("Literal type '" + type + "' can not be parsed"); + } + parsingState = PARSING_OUTSIDE; + if (trimSize > 0) { + finishSentence(); + } + } + } + + private void processOutsideChar(char[] buffer, int pos) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + switch (buffer[pos]) { + case '\"': + parsingState = PARSING_LITERAL; + tokenStartPos = pos; + break; + case '<': + parsingState = PARSING_URI; + tokenStartPos = pos; + break; + case '_': + parsingState = PARSING_BNODE; + tokenStartPos = pos; + break; + case '#': + parsingState = PARSING_COMMENT; + break; + case SENTENCE_END: + finishSentence(); + break; + default: + if (!WHITESPACE.get(buffer[pos])) { + error("Unexpected character '" + buffer[pos] + "'"); + } + } + } + + private void finishSentence() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (waitingForSentenceEnd) { + waitingForSentenceEnd = false; + } else { + error("Unexpected end of sentence"); + } + } + + private void onNonLiteral(String uri) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (waitingForSentenceEnd) { + error("End of sentence expected"); + } + if (subj == null) { + subj = uri; + } else if (pred == null) { + pred = uri; + } else if (literal == null) { + literal = uri; + quadType = OBJECT_NON_LITERAL; + } else { + onGraph(uri); + } + } + + private void onPlainLiteral(String value, String lang) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + literal = value; + literalType = lang; + quadType = OBJECT_PLAIN_LITERAL; + } + + private void onTypedLiteral(String value, String type) throws 
io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + literal = value; + literalType = type; + quadType = OBJECT_TYPED_LITERAL; + } + + private void onGraph(String value) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (quadType == OBJECT_PLAIN_LITERAL) { + sink.addPlainLiteral(subj, pred, literal, literalType, value); + } else if (quadType == OBJECT_TYPED_LITERAL) { + sink.addTypedLiteral(subj, pred, literal, literalType, value); + } else if (quadType == OBJECT_NON_LITERAL) { + sink.addNonLiteral(subj, pred, literal, value); + } + resetQuad(); + } + + @Override + public void setBaseUri(String baseUri) { + } + + @Override + protected boolean setPropertyInternal(String key, Object value) { + if (StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY.equals(key) && value instanceof io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler) { + processorGraphHandler = (ProcessorGraphHandler) value; + } else if (StreamProcessor.ENABLE_ERROR_RECOVERY.equals(key) && value instanceof Boolean) { + ignoreErrors = (Boolean) value; + } + return false; + } + + private String extractToken(char[] buffer, int tokenEndPos, int trimSize) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + String saved; + if (addBuffer != null) { + if (tokenEndPos - trimSize >= tokenStartPos) { + addBuffer.append(buffer, tokenStartPos, tokenEndPos - tokenStartPos - trimSize + 1); + } + addBuffer.delete(0, trimSize); + saved = addBuffer.toString(); + addBuffer = null; + } else { + saved = String.valueOf(buffer, tokenStartPos + trimSize, tokenEndPos - tokenStartPos + 1 - 2 * trimSize); + } + tokenStartPos = -1; + return saved; + } + + @Override + public void startStream() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + super.startStream(); + resetQuad(); + waitingForSentenceEnd = false; + parsingState = PARSING_OUTSIDE; + } + + private void resetQuad() { + addBuffer = null; + 
tokenStartPos = -1; + subj = null; + pred = null; + literal = null; + literalType = null; + quadType = -1; + waitingForSentenceEnd = true; + } + + @Override + public void endStream() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (tokenStartPos != -1 || waitingForSentenceEnd) { + error("Unexpected end of stream"); + } + super.endStream(); + } + + private String unescape(String str) throws ParseException { + int limit = str.length(); + StringBuilder result = new StringBuilder(limit); + + for (int i = 0; i < limit; i++) { + char ch = str.charAt(i); + if (ch != '\\') { + result.append(ch); + continue; + } + i++; + if (i == limit) { + break; + } + ch = str.charAt(i); + switch (ch) { + case '\\': + case '\'': + case '\"': + result.append(ch); + break; + case 'b': + result.append('\b'); + break; + case 'f': + result.append('\f'); + break; + case 'n': + result.append('\n'); + break; + case 'r': + result.append('\r'); + break; + case 't': + result.append('\t'); + break; + case 'u': + case 'U': + int sequenceLength = ch == 'u' ? 
4 : 8; + if (i + sequenceLength >= limit) { + error("Error parsing escape sequence '\\" + ch + "'"); + } + String code = str.substring(i + 1, i + 1 + sequenceLength); + i += sequenceLength; + + try { + int value = Integer.parseInt(code, 16); + result.append((char) value); + } catch (NumberFormatException nfe) { + error("Error parsing escape sequence '\\" + ch + "'"); + } + break; + default: + result.append(ch); + break; + } + } + return result.toString(); + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NQuadsSerializer.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NQuadsSerializer.java new file mode 100644 index 00000000..9f2ca840 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NQuadsSerializer.java @@ -0,0 +1,90 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.github.sparqlanything.html.org.semarglproject.rdf;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.NTriplesSerializer;
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.CharSink;
+import io.github.sparqlanything.html.org.semarglproject.sink.QuadSink;
+
/**
 * {@link QuadSink} that writes each quad it receives to a {@link CharSink}: the statement is
 * started through the inherited NTriples helpers, the object is written, and a graph label is
 * added before the terminator whenever one is supplied.
 */
public class NQuadsSerializer extends NTriplesSerializer implements QuadSink {

    private NQuadsSerializer(CharSink sink) {
        super(sink);
    }

    /**
     * Creates instance of NQuadsSerializer connected to specified sink.
     * @param sink sink to be connected to
     * @return instance of NQuadsSerializer
     */
    public static QuadSink connect(CharSink sink) {
        return new NQuadsSerializer(sink);
    }

    /**
     * Writes a quad whose object is a URI or blank node.
     * A {@link ParseException} raised by the underlying sink is dropped.
     */
    @Override
    public void addNonLiteral(String subj, String pred, String obj, String graph) {
        try {
            startTriple(subj, pred);
            serializeBnodeOrUri(obj);
            writeGraphAndTerminate(graph);
        } catch (ParseException ignored) {
            // ignore
        }
    }

    /**
     * Writes a quad whose object is a plain literal, with an {@code @lang} suffix when
     * {@code lang} is not null.
     * A {@link ParseException} raised by the underlying sink is dropped.
     */
    @Override
    public void addPlainLiteral(String subj, String pred, String content, String lang, String graph) {
        try {
            startTriple(subj, pred);
            addContent(content);
            if (lang != null) {
                sink.process('@').process(lang);
            }
            sink.process(SPACE);
            writeGraphAndTerminate(graph);
        } catch (ParseException ignored) {
            // ignore
        }
    }

    /**
     * Writes a quad whose object is a typed literal ({@code content^^type}).
     * A {@link ParseException} raised by the underlying sink is dropped.
     */
    @Override
    public void addTypedLiteral(String subj, String pred, String content, String type, String graph) {
        try {
            startTriple(subj, pred);
            addContent(content);
            sink.process("^^");
            serializeUri(type);
            writeGraphAndTerminate(graph);
        } catch (ParseException ignored) {
            // ignore
        }
    }

    /**
     * Common tail of every statement: the graph label, if there is one, then the terminator.
     */
    private void writeGraphAndTerminate(String graph) throws ParseException {
        if (graph != null) {
            serializeBnodeOrUri(graph);
        }
        sink.process(DOT_EOL);
    }

}
diff --git
a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NTriplesParser.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NTriplesParser.java new file mode 100644 index 00000000..2fdee049 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NTriplesParser.java @@ -0,0 +1,397 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler; +import io.github.sparqlanything.html.org.semarglproject.sink.CharSink; +import io.github.sparqlanything.html.org.semarglproject.sink.Pipe; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.source.StreamProcessor; + +import java.util.BitSet; + +/** + * Implementation of streaming NTriples parser. + *
+ * List of supported options: + * + */ +public final class NTriplesParser extends Pipe implements CharSink { + + /** + * Class URI for errors produced by a parser + */ + public static final String ERROR = "http://semarglproject.org/ntriples/Error"; + + private static final short PARSING_OUTSIDE = 0; + private static final short PARSING_URI = 1; + private static final short PARSING_BNODE = 2; + private static final short PARSING_LITERAL = 3; + private static final short PARSING_AFTER_LITERAL = 4; + private static final short PARSING_LITERAL_TYPE = 5; + private static final short PARSING_COMMENT = 6; + + private static final char SENTENCE_END = '.'; + + /** + * NTriples whitespace char checker + */ + private static final BitSet WHITESPACE = new BitSet(); + + static { + WHITESPACE.set('\t'); + WHITESPACE.set(' '); + WHITESPACE.set('\r'); + WHITESPACE.set('\n'); + } + + + private String subj = null; + private String pred = null; + private String literalObj = null; + + private io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler processorGraphHandler = null; + private boolean ignoreErrors = false; + private boolean skipSentence = false; + + private short parsingState; + + private int tokenStartPos; + private short charsToEscape = 0; + private boolean waitingForSentenceEnd = false; + private StringBuilder addBuffer = null; + + private NTriplesParser(TripleSink sink) { + super(sink); + } + + /** + * Creates instance of NTriplesParser connected to specified sink. 
+ * @param sink sink to be connected to + * @return instance of NTriplesParser + */ + public static CharSink connect(TripleSink sink) { + return new NTriplesParser(sink); + } + + private void error(String msg) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (processorGraphHandler != null) { + processorGraphHandler.error(ERROR, msg); + } + if (!ignoreErrors) { + throw new io.github.sparqlanything.html.org.semarglproject.rdf.ParseException(msg); + } else { + resetTriple(); + skipSentence = true; + parsingState = PARSING_OUTSIDE; + } + } + + @Override + public NTriplesParser process(String str) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + return process(str.toCharArray(), 0, str.length()); + } + + @Override + public NTriplesParser process(char ch) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + char[] buffer = new char[1]; + buffer[0] = ch; + return process(buffer, 0, 1); + } + + @Override + public NTriplesParser process(char[] buffer, int start, int count) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (tokenStartPos != -1) { + tokenStartPos = start; + } + int end = start + count; + + for (int pos = start; pos < end; pos++) { + if (skipSentence && buffer[pos] != SENTENCE_END) { + continue; + } else { + skipSentence = false; + } + + if (parsingState == PARSING_OUTSIDE) { + processOutsideChar(buffer, pos); + } else if (parsingState == PARSING_COMMENT) { + if (buffer[pos] == '\n' || buffer[pos] == '\r') { + parsingState = PARSING_OUTSIDE; + } + } else if (parsingState == PARSING_URI) { + if (buffer[pos] == '>') { + onNonLiteral(unescape(extractToken(buffer, pos, 1))); + parsingState = PARSING_OUTSIDE; + } + } else if (parsingState == PARSING_BNODE) { + if (WHITESPACE.get(buffer[pos]) || buffer[pos] == SENTENCE_END) { + onNonLiteral(extractToken(buffer, pos - 1, 0)); + parsingState = PARSING_OUTSIDE; + } + } else if (parsingState == 
PARSING_LITERAL) { + processLiteralChar(buffer, pos); + } else if (parsingState == PARSING_AFTER_LITERAL) { + if (buffer[pos] == '@' || buffer[pos] == '^') { + tokenStartPos = pos; + parsingState = PARSING_LITERAL_TYPE; + } else if (WHITESPACE.get(buffer[pos]) || buffer[pos] == SENTENCE_END) { + onPlainLiteral(literalObj, null); + parsingState = PARSING_OUTSIDE; + processOutsideChar(buffer, pos); + } else { + error("Unexpected character '" + buffer[pos] + "' after literal"); + } + } else if (parsingState == PARSING_LITERAL_TYPE) { + processLiteralTypeChar(buffer, pos); + } + } + if (tokenStartPos != -1) { + if (addBuffer == null) { + addBuffer = new StringBuilder(); + } + addBuffer.append(buffer, tokenStartPos, end - tokenStartPos); + } + return this; + } + + private void processLiteralChar(char[] buffer, int pos) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (charsToEscape == 9 && buffer[pos] == 'u') { + charsToEscape -= 5; + } else if (charsToEscape == 9 && buffer[pos] != 'U') { + charsToEscape = 0; + } else if (charsToEscape > 0) { + charsToEscape--; + } else { + if (buffer[pos] == '\"') { + literalObj = unescape(extractToken(buffer, pos, 1)); + parsingState = PARSING_AFTER_LITERAL; + } else if (buffer[pos] == '\\') { + charsToEscape = 9; + } + } + } + + private void processLiteralTypeChar(char[] buffer, int pos) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (WHITESPACE.get(buffer[pos])) { + String type = extractToken(buffer, pos, 0); + int trimSize = type.charAt(type.length() - 1) == SENTENCE_END ? 
1 : 0; + if (type.charAt(0) == '@') { + onPlainLiteral(literalObj, type.substring(1, type.length() - 1 - trimSize)); + } else if (type.startsWith("^^<") && type.charAt(type.length() - 2) == '>') { + onTypedLiteral(literalObj, type.substring(3, type.length() - 2 - trimSize)); + } else { + error("Literal type '" + type + "' can not be parsed"); + } + parsingState = PARSING_OUTSIDE; + if (trimSize > 0) { + finishSentence(); + } + } + } + + private void processOutsideChar(char[] buffer, int pos) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + switch (buffer[pos]) { + case '\"': + parsingState = PARSING_LITERAL; + tokenStartPos = pos; + break; + case '<': + parsingState = PARSING_URI; + tokenStartPos = pos; + break; + case '_': + parsingState = PARSING_BNODE; + tokenStartPos = pos; + break; + case '#': + parsingState = PARSING_COMMENT; + break; + case SENTENCE_END: + finishSentence(); + break; + default: + if (!WHITESPACE.get(buffer[pos])) { + error("Unexpected character '" + buffer[pos] + "'"); + } + } + } + + private void finishSentence() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (waitingForSentenceEnd) { + waitingForSentenceEnd = false; + } else { + error("Unexpected end of sentence"); + } + } + + private void onNonLiteral(String uri) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (waitingForSentenceEnd) { + error("End of sentence expected"); + } + if (subj == null) { + subj = uri; + } else if (pred == null) { + pred = uri; + } else { + sink.addNonLiteral(subj, pred, uri); + resetTriple(); + } + } + + private void onPlainLiteral(String value, String lang) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (subj == null || pred == null) { + if (waitingForSentenceEnd) { + error("End of sentence expected"); + } else { + error("Literal is not an object"); + } + } + sink.addPlainLiteral(subj, pred, value, lang); + resetTriple(); + } + + 
private void onTypedLiteral(String value, String type) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (subj == null || pred == null) { + if (waitingForSentenceEnd) { + error("End of sentence expected"); + } else { + error("Literal is not an object"); + } + } + sink.addTypedLiteral(subj, pred, value, type); + resetTriple(); + } + + @Override + public void setBaseUri(String baseUri) { + } + + @Override + protected boolean setPropertyInternal(String key, Object value) { + if (StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY.equals(key) && value instanceof io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler) { + processorGraphHandler = (ProcessorGraphHandler) value; + } else if (StreamProcessor.ENABLE_ERROR_RECOVERY.equals(key) && value instanceof Boolean) { + ignoreErrors = (Boolean) value; + } + return false; + } + + private String extractToken(char[] buffer, int tokenEndPos, int trimSize) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + String saved; + if (addBuffer != null) { + if (tokenEndPos - trimSize >= tokenStartPos) { + addBuffer.append(buffer, tokenStartPos, tokenEndPos - tokenStartPos - trimSize + 1); + } + addBuffer.delete(0, trimSize); + saved = addBuffer.toString(); + addBuffer = null; + } else { + saved = String.valueOf(buffer, tokenStartPos + trimSize, tokenEndPos - tokenStartPos + 1 - 2 * trimSize); + } + tokenStartPos = -1; + return saved; + } + + @Override + public void startStream() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + super.startStream(); + resetTriple(); + waitingForSentenceEnd = false; + parsingState = PARSING_OUTSIDE; + } + + private void resetTriple() { + addBuffer = null; + tokenStartPos = -1; + subj = null; + pred = null; + waitingForSentenceEnd = true; + } + + @Override + public void endStream() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (tokenStartPos != -1 || 
waitingForSentenceEnd) { + error("Unexpected end of stream"); + } + super.endStream(); + } + + private String unescape(String str) throws ParseException { + int limit = str.length(); + StringBuilder result = new StringBuilder(limit); + + for (int i = 0; i < limit; i++) { + char ch = str.charAt(i); + if (ch != '\\') { + result.append(ch); + continue; + } + i++; + if (i == limit) { + break; + } + ch = str.charAt(i); + switch (ch) { + case '\\': + case '\'': + case '\"': + result.append(ch); + break; + case 'b': + result.append('\b'); + break; + case 'f': + result.append('\f'); + break; + case 'n': + result.append('\n'); + break; + case 'r': + result.append('\r'); + break; + case 't': + result.append('\t'); + break; + case 'u': + case 'U': + int sequenceLength = ch == 'u' ? 4 : 8; + if (i + sequenceLength >= limit) { + error("Error parsing escape sequence '\\" + ch + "'"); + } + String code = str.substring(i + 1, i + 1 + sequenceLength); + i += sequenceLength; + + try { + int value = Integer.parseInt(code, 16); + result.append((char) value); + } catch (NumberFormatException nfe) { + error("Error parsing escape sequence '\\" + ch + "'"); + } + break; + default: + result.append(ch); + break; + } + } + return result.toString(); + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NTriplesSerializer.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NTriplesSerializer.java new file mode 100644 index 00000000..9077457b --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/NTriplesSerializer.java @@ -0,0 +1,228 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.sink.CharSink; +import io.github.sparqlanything.html.org.semarglproject.sink.Pipe; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDF; + +import java.util.BitSet; + +/** + * Implementation of {@link TripleSink} which serializes triples to + * {@link CharSink} using NTriples syntax. 
+ */ +public class NTriplesSerializer extends Pipe implements TripleSink { + + protected static final String DOT_EOL = ".\n"; + protected static final char SPACE = ' '; + + private static final char QUOTE = '"'; + private static final char URI_START = '<'; + private static final char URI_END = '>'; + + private static final BitSet ESCAPABLE_CONTENT_CHARS = new BitSet(); + private static final BitSet ESCAPABLE_URI_CHARS = new BitSet(); + + static { + ESCAPABLE_CONTENT_CHARS.set('\\'); + ESCAPABLE_CONTENT_CHARS.set('\"'); + ESCAPABLE_CONTENT_CHARS.set('\b'); + ESCAPABLE_CONTENT_CHARS.set('\f'); + ESCAPABLE_CONTENT_CHARS.set('\n'); + ESCAPABLE_CONTENT_CHARS.set('\r'); + ESCAPABLE_CONTENT_CHARS.set('\t'); + + for (char ch = 0; ch <= 0x20; ch++) { + ESCAPABLE_URI_CHARS.set(ch); + } + ESCAPABLE_URI_CHARS.set('\\'); + ESCAPABLE_URI_CHARS.set('<'); + ESCAPABLE_URI_CHARS.set('>'); + ESCAPABLE_URI_CHARS.set('{'); + ESCAPABLE_URI_CHARS.set('}'); + ESCAPABLE_URI_CHARS.set('"'); + ESCAPABLE_URI_CHARS.set('`'); + ESCAPABLE_URI_CHARS.set('|'); + ESCAPABLE_URI_CHARS.set('^'); + } + + protected NTriplesSerializer(CharSink sink) { + super(sink); + } + + /** + * Creates instance of TurtleSerializer connected to specified sink. 
+ * @param sink sink to be connected to + * @return instance of TurtleSerializer + */ + public static TripleSink connect(CharSink sink) { + return new NTriplesSerializer(sink); + } + + @Override + public void addNonLiteral(String subj, String pred, String obj) { + try { + startTriple(subj, pred); + serializeBnodeOrUri(obj); + sink.process(DOT_EOL); + } catch (ParseException e) { + // ignore + } + } + + @Override + public void addPlainLiteral(String subj, String pred, String content, String lang) { + try { + startTriple(subj, pred); + addContent(content); + if (lang != null) { + sink.process('@').process(lang); + } + sink.process(SPACE).process(DOT_EOL); + } catch (ParseException e) { + // ignore + } + } + + @Override + public void addTypedLiteral(String subj, String pred, String content, String type) { + try { + startTriple(subj, pred); + addContent(content); + sink.process("^^"); + serializeUri(type); + sink.process(DOT_EOL); + } catch (ParseException e) { + // ignore + } + } + + @Override + protected boolean setPropertyInternal(String key, Object value) { + return false; + } + + @Override + public void setBaseUri(String baseUri) { + // ignore + } + + protected void startTriple(String subj, String pred) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + serializeBnodeOrUri(subj); + serializeBnodeOrUri(pred); + } + + protected void serializeBnodeOrUri(String value) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (value.startsWith(RDF.BNODE_PREFIX)) { + sink.process(value).process(SPACE); + } else { + serializeUri(value); + } + } + + protected void serializeUri(String uri) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + String escapedUri = escapeUri(uri); + sink.process(URI_START).process(escapedUri).process(URI_END).process(SPACE); + } + + protected void addContent(String content) throws ParseException { + String escapedContent = escapeContent(content); + 
sink.process(QUOTE).process(escapedContent).process(QUOTE); + } + + private static String escapeContent(String str) { + int limit = str.length(); + int pos = 0; + for (; pos < limit; pos++) { + char ch = str.charAt(pos); + if (ch > 0x80 || ESCAPABLE_CONTENT_CHARS.get(ch)) { + break; + } + } + if (pos == limit) { + return str; + } + StringBuilder result = new StringBuilder(limit); + result.append(str.substring(0, pos)); + for (; pos < limit; pos++) { + char ch = str.charAt(pos); + if (ch < 0x80) { + switch (ch) { + case '\\': + case '\"': + result.append('\\').append(ch); + break; + case '\b': + result.append("\\b"); + break; + case '\f': + result.append("\\f"); + break; + case '\n': + result.append("\\n"); + break; + case '\r': + result.append("\\r"); + break; + case '\t': + result.append("\\t"); + break; + default: + result.append(ch); + } + } else if (ch <= 0xFFFF) { + result.append("\\u").append(String.format("%04X", (int) ch)); + } else { + result.append("\\U").append(String.format("%08X", (int) ch)); + } + } + return result.toString(); + } + + private static String escapeUri(String str) { + int limit = str.length(); + int pos = 0; + for (; pos < limit; pos++) { + char ch = str.charAt(pos); + if (ch > 0x80 || ESCAPABLE_URI_CHARS.get(ch)) { + break; + } + } + if (pos == limit) { + return str; + } + StringBuilder result = new StringBuilder(limit); + result.append(str.substring(0, pos)); + for (; pos < limit; pos++) { + char ch = str.charAt(pos); + if (ESCAPABLE_URI_CHARS.get(ch)) { + result.append("\\u").append(String.format("%04X", (int) ch)); + } else if (ch < 0x80) { + result.append(ch); + } else if (ch <= 0xFFFF) { + result.append("\\u").append(String.format("%04X", (int) ch)); + } else { + result.append("\\U").append(String.format("%08X", (int) ch)); + } + } + return result.toString(); + } + + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/ParseException.java 
b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/ParseException.java
new file mode 100644
index 00000000..7d232a28
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/ParseException.java
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.rdf;
+/**
+ * Checked exception used by the parsers and serializers of this package to
+ * signal that input could not be processed.
+ */
+public class ParseException extends Exception {
+
+    private static final long serialVersionUID = 2088926094965976520L;
+    /**
+     * Creates an exception with a message only.
+     * @param string description of the problem
+     */
+    public ParseException(String string) {
+        super(string);
+    }
+    /**
+     * Creates an exception with a message and the underlying cause.
+     * @param message description of the problem
+     * @param cause exception that triggered this one
+     */
+    public ParseException(String message, Throwable cause) {
+        super(message, cause);
+    }
+    /**
+     * Creates an exception that wraps another one.
+     * @param cause exception that triggered this one
+     */
+    public ParseException(Throwable cause) {
+        super(cause);
+    }
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/ProcessorGraphHandler.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/ProcessorGraphHandler.java
new file mode 100644
index 00000000..4a07385b
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/ProcessorGraphHandler.java
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.rdf;
+
+/**
+ * Interface for handling processor graph events.
+ * Each callback receives the URI of the event class together with a
+ * human readable message.
+ */
+public interface ProcessorGraphHandler {
+
+    /**
+     * Callback for info events
+     * @param infoClass info class URI
+     * @param message info message
+     */
+    void info(String infoClass, String message);
+
+    /**
+     * Callback for warning events
+     * @param warningClass warning class URI
+     * @param message warning message
+     */
+    void warning(String warningClass, String message);
+
+    /**
+     * Callback for error events
+     * @param errorClass error class URI
+     * @param message error message
+     */
+    void error(String errorClass, String message);
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/RdfXmlParser.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/RdfXmlParser.java
new file mode 100644
index 00000000..b7bb3ced
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/RdfXmlParser.java
@@ -0,0 +1,730 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler; +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedIriException; +import io.github.sparqlanything.html.org.semarglproject.ri.RIUtils; +import io.github.sparqlanything.html.org.semarglproject.sink.Pipe; +import io.github.sparqlanything.html.org.semarglproject.sink.XmlSink; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.source.StreamProcessor; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDF; +import io.github.sparqlanything.html.org.semarglproject.xml.XmlUtils; +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +import javax.xml.XMLConstants; +import java.util.*; + +/** + * Implementation of streaming RDF/XML parser. + *
+ * List of supported options: + *
    + *
  • {@link StreamProcessor#PROCESSOR_GRAPH_HANDLER_PROPERTY}
  • + *
  • {@link StreamProcessor#ENABLE_ERROR_RECOVERY}
  • + *
+ */ +public final class RdfXmlParser extends Pipe implements XmlSink { + + /** + * Class URI for errors produced by a parser + */ + public static final String ERROR = "http://semarglproject.org/ntriples/Error"; + + private static final String IS_NOT_ALLOWED_HERE = " is not allowed here"; + + // processing modes + private static final short INSIDE_OF_PROPERTY = 1; + private static final short INSIDE_OF_RESOURCE = 2; + private static final short PARSE_TYPE_LITERAL = 3; + private static final short PARSE_TYPE_COLLECTION = 4; + private static final short PARSE_TYPE_RESOURCE = 5; + private static final short ERROR_RECOVERY = 6; + + private static final String ID_ATTR = "ID"; + private static final String NODE_ID_ATTR = "nodeID"; + private static final String ABOUT_ATTR = "about"; + + private static final String PARSE_LITERAL_VALUE = "Literal"; + private static final String PARSE_RESOURCE_VALUE = "Resource"; + private static final String PARSE_COLLECTION_VALUE = "Collection"; + + private short mode = 0; + + private String baseUri = ""; + + private final Stack modeStack = new Stack(); + private final Stack langStack = new Stack(); + private final Stack baseStack = new Stack(); + private final Stack subjStack = new Stack(); + private final Stack subjLiIndexStack = new Stack(); + private final Map nsMappings = new HashMap(); + + private final Set processedIDs = new HashSet(); + + private int bnodeId = 0; + + // IRI or bnode + private String subjRes = null; + + // tail node of parseType="Collection" + private String seqTailRes = null; + + // predicate IRI + private String predIri = null; + + // typed literal datatype IRI + private String datatypeIri = null; + + private String reifyIri = null; + private boolean captureLiteral = false; + + private int parseDepth = 0; + private StringBuilder parse = new StringBuilder(); + + private io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler processorGraphHandler = null; + private boolean ignoreErrors = false; + 
+ // holds data for triples which addition depends on XML node contents (blank or not) + private List pendingTriples = new ArrayList(); + + private RdfXmlParser(TripleSink sink) { + super(sink); + } + + /** + * Creates instance of RdfXmlParser connected to specified sink. + * @param sink sink to be connected to + * @return instance of RdfXmlParser + */ + public static XmlSink connect(TripleSink sink) { + return new RdfXmlParser(sink); + } + + private void error(String msg) throws SAXException { + if (processorGraphHandler != null) { + processorGraphHandler.error(ERROR, msg); + } + if (ignoreErrors) { + modeStack.push(mode); + mode = ERROR_RECOVERY; + } else { + throw new SAXException(new io.github.sparqlanything.html.org.semarglproject.rdf.ParseException(msg)); + } + } + + @SuppressWarnings("deprecation") + private boolean violatesSchema(String nodeIri) { + return nodeIri == null || nodeIri.isEmpty() || nodeIri.equals(RDF.PARSE_TYPE) + || nodeIri.equals(RDF.ABOUT_EACH) || nodeIri.equals(RDF.DATATYPE) + || nodeIri.equals(RDF.BAG_ID) || nodeIri.equals(RDF.ABOUT) + || nodeIri.equals(RDF.RESOURCE) || nodeIri.equals(RDF.NODEID) + || nodeIri.equals(RDF.ID) || nodeIri.equals(RDF.ABOUT_EACH_PREFIX); + } + + @Override + public void startElement(String nsUri, String lname, String qname, Attributes attrs) throws SAXException { + processPendingTriples(true); + + modeStack.push(mode); + + if (parseDepth > 0) { + parseDepth++; + if (mode == PARSE_TYPE_LITERAL) { + parse.append(XmlUtils.serializeOpenTag(nsUri, qname, nsMappings, attrs, true)); + nsMappings.clear(); + return; + } + } + + if (mode == ERROR_RECOVERY) { + return; + } + + processLangAndBase(attrs); + + String iri = nsUri + lname; + if (subjRes == null && (nsUri == null || nsUri.isEmpty()) || iri.equals(RDF.RDF)) { + return; + } + if (violatesSchema(iri)) { + error(qname + IS_NOT_ALLOWED_HERE); + } + + switch (mode) { + case PARSE_TYPE_COLLECTION: + case INSIDE_OF_PROPERTY: { + subjRes = getSubject(attrs); + if 
(subjRes == null) { + // error during subject processing was ignored so we need to skip next steps + return; + } + + if (mode != PARSE_TYPE_COLLECTION && !subjStack.isEmpty()) { + processNonLiteralTriple(subjStack.peek(), predIri, subjRes); + } + + if (!iri.equals(RDF.DESCRIPTION)) { + if (iri.equals(RDF.LI)) { + error(qname + IS_NOT_ALLOWED_HERE); + } else { + sink.addNonLiteral(subjRes, RDF.TYPE, iri); + } + } + + processResourceAttrs(qname, attrs); + + subjStack.push(subjRes); + subjLiIndexStack.push(1); + if (mode == INSIDE_OF_PROPERTY) { + mode = INSIDE_OF_RESOURCE; + } + break; + } + case PARSE_TYPE_RESOURCE: + case INSIDE_OF_RESOURCE: { + int liIndex = subjLiIndexStack.pop(); + + boolean correctProperty = checkPropertyForErrors(qname, iri, attrs); + + if (!correctProperty) { + // error during property processing was ignored so we need to skip next steps + return; + } + + predIri = iri; + if (predIri.equals(RDF.LI)) { + predIri = RDF.NS + "_" + liIndex++; + } + subjLiIndexStack.push(liIndex); + + String nodeId = attrs.getValue(RDF.NS, ID_ATTR); + if (nodeId != null) { + reifyIri = resolveIRINoResolve(baseStack.peek(), nodeId); + } + + captureLiteral = true; + mode = INSIDE_OF_PROPERTY; + processPropertyAttrs(nsUri, attrs); + if (captureLiteral) { + parse = new StringBuilder(); + } + break; + } + default: + throw new IllegalStateException("Unknown mode = " + mode); + } + } + + private void processPendingTriples(boolean forceNewBNode) { + Iterator iterator = pendingTriples.iterator(); + while (iterator.hasNext()) { + String propRes = iterator.next(); + String attr = iterator.next(); + String value = iterator.next(); + if (forceNewBNode || propRes == null) { + String bnode = newBnode(); + processNonLiteralTriple(subjRes, predIri, bnode); + sink.addPlainLiteral(bnode, attr, value, langStack.peek()); + } else { + sink.addPlainLiteral(propRes, attr, value, langStack.peek()); + } + } + pendingTriples.clear(); + } + + private boolean checkPropertyForErrors(String 
qname, String iri, Attributes attrs) throws SAXException { + if (iri.equals(RDF.NIL) || iri.equals(RDF.DESCRIPTION)) { + error(qname + IS_NOT_ALLOWED_HERE); + return false; + } + if (!RIUtils.isIri(iri)) { + error("Invalid property IRI"); + return false; + } + + if (attrs.getValue(RDF.NS, "resource") != null && attrs.getValue(RDF.NS, NODE_ID_ATTR) != null) { + error("Both rdf:resource and rdf:nodeID are present"); + return false; + } + if (attrs.getValue(RDF.NS, "parseType") != null && !isAttrsValidForParseType(attrs)) { + error("rdf:parseType conflicts with other attributes"); + return false; + } + return true; + } + + private void processResourceAttrs(String qname, Attributes attrs) throws SAXException { + for (int i = 0; i < attrs.getLength(); i++) { + String tag = attrs.getURI(i) + attrs.getLocalName(i); + if (tag.equals(RDF.NODEID) || tag.equals(RDF.ABOUT) || tag.equals(RDF.ID) + || attrs.getQName(i).startsWith(XMLConstants.XML_NS_PREFIX)) { + continue; + } + String value = attrs.getValue(i); + if (tag.equals(RDF.TYPE)) { + sink.addNonLiteral(subjRes, RDF.TYPE, value); + } else { + if (violatesSchema(tag) || tag.equals(RDF.LI)) { + error(qname + IS_NOT_ALLOWED_HERE); + } else { + sink.addPlainLiteral(subjRes, tag, value, langStack.peek()); + } + } + } + } + + private void processPropertyAttrs(String nsUri, Attributes attrs) throws SAXException { + // process resource first + int resIdx = attrs.getIndex(RDF.NS, "resource"); + String propertyRes = null; + if (resIdx >= 0) { + propertyRes = processPropertyRes(attrs.getValue(resIdx)); + } + + for (int i = 0; i < attrs.getLength(); i++) { + if (i == resIdx) { + continue; + } + String attr = attrs.getURI(i) + attrs.getLocalName(i); + if (attrs.getQName(i).startsWith(XMLConstants.XML_NS_PREFIX) || attr.equals(RDF.ID)) { + continue; + } + processPropertyTagAttr(nsUri, attr, attrs.getValue(i), propertyRes); + } + } + + private void processLangAndBase(Attributes attrs) throws SAXException { + String lang = 
langStack.peek(); + if (attrs.getValue(XmlUtils.XML_LANG) != null) { + lang = attrs.getValue(XmlUtils.XML_LANG); + } + langStack.push(lang); + + String base = baseStack.peek(); + if (attrs.getValue(XmlUtils.XML_BASE) != null) { + base = attrs.getValue(XmlUtils.XML_BASE); + if (base.contains("#")) { + base = base.substring(0, base.lastIndexOf('#')); + } + base += '#'; + if (!RIUtils.isAbsoluteIri(base)) { + error("Invalid base IRI"); + base = baseStack.peek(); + } + } + baseStack.push(base); + } + + private String processPropertyRes(String value) throws SAXException { + String propertyRes = resolveIRI(baseStack.peek(), value); + if (propertyRes != null) { + processNonLiteralTriple(subjRes, predIri, propertyRes); + captureLiteral = false; + } + return propertyRes; + } + + private void processPropertyTagAttr(String nsUri, String attr, String value, + String propertyRes) throws SAXException { + if (attr.equals(RDF.DATATYPE)) { + datatypeIri = resolveIRINoResolve(nsUri, value); + } else if (attr.equals(RDF.PARSE_TYPE)) { + parseDepth = 1; + if (value.equalsIgnoreCase(PARSE_LITERAL_VALUE)) { + parse = new StringBuilder(); + mode = PARSE_TYPE_LITERAL; + } else if (value.equalsIgnoreCase(PARSE_RESOURCE_VALUE)) { + String bnode = newBnode(); + processNonLiteralTriple(subjRes, predIri, bnode); + subjRes = bnode; + subjStack.push(subjRes); + subjLiIndexStack.push(1); + mode = PARSE_TYPE_RESOURCE; + } else if (value.equalsIgnoreCase(PARSE_COLLECTION_VALUE)) { + String bnode = newBnode(); + sink.addNonLiteral(subjRes, predIri, bnode); + subjRes = bnode; + seqTailRes = null; + subjStack.push(bnode); + subjLiIndexStack.push(1); + mode = PARSE_TYPE_COLLECTION; + } + captureLiteral = false; + } else if (attr.equals(RDF.NODEID)) { + if (!XmlUtils.isValidNCName(value)) { + error("Invalid nodeID"); + } else { + String id = RDF.BNODE_PREFIX + 'n' + value.hashCode(); + processNonLiteralTriple(subjRes, predIri, id); + captureLiteral = false; + } + } else { + if (violatesSchema(attr) || 
attr.equals(RDF.NIL)) { + error(attr + IS_NOT_ALLOWED_HERE); + } else { + pendingTriples.add(propertyRes); + pendingTriples.add(attr); + pendingTriples.add(value); + captureLiteral = false; + } + } + } + + @Override + public void endElement(String namespaceUri, String lname, String qname) throws SAXException { + processPendingTriples(false); + if (parseDepth > 0) { + parseDepth--; + if (mode == PARSE_TYPE_LITERAL && parseDepth > 0) { + parse.append(""); + return; + } + } + if (subjStack.isEmpty()) { + return; + } + + switch (mode) { + case PARSE_TYPE_RESOURCE: + case INSIDE_OF_RESOURCE: { + subjStack.pop(); + if (!subjStack.isEmpty()) { + subjRes = subjStack.peek(); + } + subjLiIndexStack.pop(); + if (mode == INSIDE_OF_RESOURCE) { + mode = INSIDE_OF_PROPERTY; + } else { + mode = INSIDE_OF_RESOURCE; + } + break; + } + case PARSE_TYPE_COLLECTION: { + subjStack.pop(); + subjLiIndexStack.pop(); + if (parseDepth > 0) { + if (seqTailRes == null) { + seqTailRes = subjStack.peek(); + sink.addNonLiteral(seqTailRes, RDF.FIRST, subjRes); + } else { + String bnode = newBnode(); + sink.addNonLiteral(seqTailRes, RDF.REST, bnode); + sink.addNonLiteral(bnode, RDF.FIRST, subjRes); + seqTailRes = bnode; + } + } else { + sink.addNonLiteral(seqTailRes, RDF.REST, RDF.NIL); + if (!subjStack.isEmpty()) { + subjRes = subjStack.peek(); + } + mode = INSIDE_OF_RESOURCE; + } + break; + } + case INSIDE_OF_PROPERTY: { + if (captureLiteral) { + String value = parse.toString(); + if (datatypeIri != null) { + processLiteralTriple(subjRes, predIri, value, datatypeIri, true); + } else { + processLiteralTriple(subjRes, predIri, value, langStack.peek(), false); + } + captureLiteral = false; + } + mode = INSIDE_OF_RESOURCE; + break; + } + case PARSE_TYPE_LITERAL: { + processLiteralTriple(subjRes, predIri, parse.toString(), RDF.XML_LITERAL, true); + mode = INSIDE_OF_RESOURCE; + break; + } + case ERROR_RECOVERY: { + mode = modeStack.pop(); + return; + } + default: + throw new 
IllegalStateException("Unknown mode = " + mode); + } + langStack.pop(); + baseStack.pop(); + // TODO: fix modeStack + short savedMode = modeStack.pop(); + if (savedMode == PARSE_TYPE_RESOURCE) { + mode = savedMode; + } + } + + private boolean isAttrsValidForParseType(Attributes attrs) { + for (int i = 0; i < attrs.getLength(); i++) { + if (attrs.getQName(i).startsWith("xml")) { + continue; + } + String uri = attrs.getURI(i) + attrs.getLocalName(i); + if (uri.equals(RDF.PARSE_TYPE) || uri.equals(RDF.ID)) { + continue; + } + return false; + } + return true; + } + + private void processNonLiteralTriple(String subj, String pred, String obj) { + sink.addNonLiteral(subj, pred, obj); + if (reifyIri != null) { + sink.addNonLiteral(reifyIri, RDF.TYPE, RDF.STATEMENT); + sink.addNonLiteral(reifyIri, RDF.SUBJECT, subj); + sink.addNonLiteral(reifyIri, RDF.PREDICATE, pred); + sink.addNonLiteral(reifyIri, RDF.OBJECT, obj); + reifyIri = null; + } + } + + private void processLiteralTriple(String subj, String pred, String value, String langOrDt, boolean typed) { + if (typed) { + sink.addTypedLiteral(subj, pred, value, langOrDt); + } else { + sink.addPlainLiteral(subj, pred, value, langOrDt); + } + if (reifyIri != null) { + sink.addNonLiteral(reifyIri, RDF.TYPE, RDF.STATEMENT); + sink.addNonLiteral(reifyIri, RDF.SUBJECT, subj); + sink.addNonLiteral(reifyIri, RDF.PREDICATE, pred); + if (typed) { + sink.addTypedLiteral(reifyIri, RDF.OBJECT, value, langOrDt); + } else { + sink.addPlainLiteral(reifyIri, RDF.OBJECT, value, langOrDt); + } + reifyIri = null; + } + } + + private String getSubject(Attributes attrs) throws SAXException { + int count = 0; + String result = null; + String attrValue = attrs.getValue(RDF.NS, ABOUT_ATTR); + if (attrValue != null) { + result = resolveIRI(baseStack.peek(), attrValue); + if (result != null) { + count++; + } + } + attrValue = attrs.getValue(RDF.NS, ID_ATTR); + if (attrValue != null) { + result = resolveIRINoResolve(baseStack.peek(), attrValue); + if 
(result != null) { + if (processedIDs.contains(result)) { + error("Duplicate definition for resource ID = " + result); + return null; + } + processedIDs.add(result); + count++; + } + } + attrValue = attrs.getValue(RDF.NS, NODE_ID_ATTR); + if (attrValue != null) { + result = RDF.BNODE_PREFIX + 'n' + attrValue.hashCode(); + count++; + } + if (count == 0) { + return newBnode(); + } + if (count > 1) { + error("Ambiguous identifier definition"); + return null; + } + return result; + } + + private String newBnode() { + bnodeId++; + return RDF.BNODE_PREFIX + 'n' + bnodeId; + } + + /** + * Resolves specified IRI ignoring special cases + * @param baseIri base to resolve against + * @param iri IRI to resolve + * @return resolved IRI or null on error + * @throws SAXException + */ + private String resolveIRINoResolve(String baseIri, String iri) throws SAXException { + if (RIUtils.isAbsoluteIri(iri)) { + return iri; + } + if (!XmlUtils.isValidNCName(iri)) { + error("Vocab term must be a valid NCName"); + return null; + } + String result = baseIri + iri; + if (RIUtils.isAbsoluteIri(result)) { + return result; + } + error("Malformed IRI: " + iri); + return null; + } + + /** + * Resolves specified IRI + * @param baseIri base to resolve against + * @param iri IRI to resolve + * @return resolved IRI or null on error + * @throws SAXException + */ + private String resolveIRI(String baseIri, String iri) throws SAXException { + try { + return RIUtils.resolveIri(baseIri, iri); + } catch (MalformedIriException e) { + error(e.getMessage()); + return null; + } + } + + @Override + public void startDocument() throws SAXException { + mode = INSIDE_OF_PROPERTY; + sink.setBaseUri(baseUri); + baseStack.push(baseUri); + langStack.push(null); + captureLiteral = false; + subjRes = null; + seqTailRes = null; + predIri = null; + datatypeIri = null; + reifyIri = null; + parseDepth = 0; + } + + @Override + public void endDocument() throws SAXException { + langStack.clear(); + baseStack.clear(); + 
subjStack.clear(); + modeStack.clear(); + subjLiIndexStack.clear(); + nsMappings.clear(); + processedIDs.clear(); + parse = new StringBuilder(); + pendingTriples.clear(); + } + + @Override + public void characters(char[] buffer, int offset, int length) throws SAXException { + processPendingTriples(true); + if (mode == PARSE_TYPE_LITERAL || captureLiteral) { + parse.append(String.copyValueOf(buffer, offset, length)); + } + } + + @Override + public void ignorableWhitespace(char[] buffer, int offset, int length) throws SAXException { + characters(buffer, offset, length); + } + + @Override + public void processingInstruction(String target, String data) throws SAXException { + processPendingTriples(true); + if (parseDepth > 0 && mode == PARSE_TYPE_LITERAL) { + parse.append(""); + } + } + + @Override + public void comment(char[] buffer, int offset, int length) throws SAXException { + processPendingTriples(true); + if (parseDepth > 0 && mode == PARSE_TYPE_LITERAL) { + parse.append(""); + } + } + + @Override + public void startPrefixMapping(String abbr, String uri) throws SAXException { + if (mode == PARSE_TYPE_LITERAL) { + nsMappings.put(abbr, uri); + } + } + + @Override + public void setBaseUri(String baseUri) { + if (baseUri != null && !baseUri.isEmpty() && Character.isLetter(baseUri.charAt(baseUri.length() - 1))) { + this.baseUri = baseUri + "#"; + } else { + this.baseUri = baseUri == null ? 
"" : baseUri; + } + } + + @Override + public void setDocumentLocator(Locator arg0) { + } + + @Override + public void skippedEntity(String arg0) throws SAXException { + } + + @Override + public void endPrefixMapping(String arg0) throws SAXException { + } + + @Override + public void endCDATA() throws SAXException { + } + + @Override + public void endDTD() throws SAXException { + } + + @Override + public void endEntity(String arg0) throws SAXException { + } + + @Override + public void startCDATA() throws SAXException { + } + + @Override + public void startDTD(String arg0, String arg1, String arg2) throws SAXException { + } + + @Override + public void startEntity(String arg0) throws SAXException { + } + + @Override + public io.github.sparqlanything.html.org.semarglproject.rdf.ParseException processException(SAXException e) { + Throwable cause = e.getCause(); + if (cause instanceof io.github.sparqlanything.html.org.semarglproject.rdf.ParseException) { + return (ParseException) cause; + } + return new ParseException(e); + } + + @Override + protected boolean setPropertyInternal(String key, Object value) { + if (StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY.equals(key) && value instanceof io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler) { + processorGraphHandler = (ProcessorGraphHandler) value; + } else if (StreamProcessor.ENABLE_ERROR_RECOVERY.equals(key) && value instanceof Boolean) { + ignoreErrors = (Boolean) value; + } + return false; + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/TurtleSerializer.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/TurtleSerializer.java new file mode 100644 index 00000000..ee45d0c9 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/TurtleSerializer.java @@ -0,0 +1,239 @@ +/** + * Copyright 2012-2013 the Semargl contributors. 
See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.sink.CharSink; +import io.github.sparqlanything.html.org.semarglproject.sink.Pipe; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDF; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Queue; +import java.util.Set; + +/** + * Implementation of {@link TripleSink} which serializes triples to {@link CharSink} using + * Turtle syntax. 
* + */ +public final class TurtleSerializer extends Pipe implements TripleSink { + + private static final String DOT_EOL = " .\n"; + private static final String COMMA_EOL = " ,\n"; + private static final String SEMICOLON_EOL = " ;\n"; + private static final String EOL = "\n"; + + private static final String MULTILINE_QUOTE = "\"\"\""; + private static final char SINGLE_LINE_QUOTE = '"'; + private static final char BNODE_START = '['; + private static final char BNODE_END = ']'; + private static final char URI_START = '<'; + private static final char URI_END = '>'; + + private static final char SPACE = ' '; + private static final char RDF_TYPE_ABBR = 'a'; + private static final String INDENT = " "; + + private String prevSubj; + private String prevPred; + private final Queue bnodeStack = new LinkedList(); + private final Set namedBnodes = new HashSet(); + private String baseUri; + + private TurtleSerializer(CharSink sink) { + super(sink); + } + + /** + * Creates instance of TurtleSerializer connected to specified sink. 
+ * @param sink sink to be connected to + * @return instance of TurtleSerializer + */ + public static TripleSink connect(CharSink sink) { + return new TurtleSerializer(sink); + } + + @Override + public void addNonLiteral(String subj, String pred, String obj) { + try { + startTriple(subj, pred); + if (obj.startsWith(RDF.BNODE_PREFIX)) { + if (!namedBnodes.contains(obj) && obj.endsWith(RDF.SHORTENABLE_BNODE_SUFFIX)) { + openBnode(obj); + } else { + sink.process(obj); + } + } else { + serializeUri(obj); + } + } catch (ParseException e) { + // ignore + } + } + + @Override + public void addPlainLiteral(String subj, String pred, String content, String lang) { + try { + startTriple(subj, pred); + addContent(content); + if (lang != null) { + sink.process('@').process(lang); + } + } catch (ParseException e) { + // ignore + } + } + + @Override + public void addTypedLiteral(String subj, String pred, String content, String type) { + try { + startTriple(subj, pred); + addContent(content); + sink.process("^^"); + serializeUri(type); + } catch (ParseException e) { + // ignore + } + } + + @Override + public void startStream() throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + super.startStream(); + prevSubj = null; + prevPred = null; + if (baseUri != null) { + sink.process("@base ").process(URI_START).process(baseUri).process(URI_END).process(DOT_EOL); + } + sink.process("@prefix rdf: ").process(URI_START).process(RDF.NS).process(URI_END).process(DOT_EOL); + bnodeStack.clear(); + namedBnodes.clear(); + } + + @Override + public void endStream() throws ParseException { + while (!bnodeStack.isEmpty()) { + closeBnode(); + } + if (prevPred != null) { + sink.process(DOT_EOL); + } else { + sink.process(EOL); + } + baseUri = null; + super.endStream(); + } + + @Override + protected boolean setPropertyInternal(String key, Object value) { + return false; + } + + @Override + public void setBaseUri(String baseUri) { + this.baseUri = baseUri.substring(0, 
baseUri.length() - 1); + } + + private void startTriple(String subj, String pred) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (subj.equals(prevSubj)) { + if (pred.equals(prevPred)) { + sink.process(COMMA_EOL); + indent(2); + } else if (prevPred != null) { + sink.process(SEMICOLON_EOL); + indent(1); + serializePredicate(pred); + } else { + indent(0); + serializePredicate(pred); + } + } else { + if (!bnodeStack.isEmpty()) { + closeBnode(); + startTriple(subj, pred); + return; + } else if (prevSubj != null) { + sink.process(DOT_EOL); + } + if (subj.startsWith(RDF.BNODE_PREFIX)) { + if (subj.endsWith(RDF.SHORTENABLE_BNODE_SUFFIX)) { + openBnode(subj); + } else { + sink.process(subj).process(SPACE); + namedBnodes.add(subj); + } + } else { + serializeUri(subj); + } + serializePredicate(pred); + } + prevSubj = subj; + prevPred = pred; + } + + private void serializePredicate(String pred) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + if (RDF.TYPE.equals(pred)) { + sink.process(RDF_TYPE_ABBR).process(SPACE); + } else { + serializeUri(pred); + } + } + + private void serializeUri(String uri) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + String escapedUri = uri.replace("\\", "\\\\").replace(">", "\\u003E"); + if (escapedUri.startsWith(RDF.NS)) { + sink.process("rdf:").process(escapedUri.substring(RDF.NS.length())); + } else if (baseUri != null && escapedUri.startsWith(baseUri)) { + sink.process(URI_START).process(escapedUri.substring(baseUri.length())).process(URI_END); + } else { + sink.process(URI_START).process(escapedUri).process(URI_END); + } + sink.process(SPACE); + } + + private void indent(int additionalIndent) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + for (int i = 0; i < bnodeStack.size() + additionalIndent; i++) { + sink.process(INDENT); + } + } + + private void addContent(String content) throws 
io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + String escapedContent = content.replace("\\", "\\\\").replace("\"", "\\\""); + if (escapedContent.contains(EOL)) { + sink.process(MULTILINE_QUOTE).process(escapedContent).process(MULTILINE_QUOTE); + } else { + sink.process(SINGLE_LINE_QUOTE).process(escapedContent).process(SINGLE_LINE_QUOTE); + } + } + + private void openBnode(String obj) throws io.github.sparqlanything.html.org.semarglproject.rdf.ParseException { + sink.process(BNODE_START); + bnodeStack.offer(obj); + prevSubj = obj; + prevPred = null; + } + + private void closeBnode() throws ParseException { + sink.process(BNODE_END); + bnodeStack.poll(); + prevSubj = bnodeStack.peek(); + prevPred = null; + if (prevSubj == null) { + sink.process(DOT_EOL); + } + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/DocumentContext.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/DocumentContext.java new file mode 100644 index 00000000..94eaebe8 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/DocumentContext.java @@ -0,0 +1,144 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.sparqlanything.html.org.semarglproject.rdf.rdfa; + +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.RdfaParser; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary; +import io.github.sparqlanything.html.org.semarglproject.ri.RIUtils; +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedIriException; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDF; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDFa; + +import java.util.HashMap; +import java.util.Map; + +final class DocumentContext { + + static final short FORMAT_UNKNOWN = 0; + static final short FORMAT_HTML4 = 1; + static final short FORMAT_HTML5 = 2; + static final short FORMAT_XML = 3; + static final short FORMAT_SVG = 4; + + private static final String RDFA_10_STRING = "rdfa 1.0"; + + private static final String HTML_ROOT_ELEMENT = "html"; + private static final String HTML_BASE = "base"; + private static final String SVG_ROOT_ELEMENT = "svg"; + + short documentFormat; + short rdfaVersion; + + final io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.RdfaParser parser; + + String base; + String originUri; + + private Map bnodeMapping = new HashMap(); + private int nextBnodeId; + + DocumentContext(RdfaParser parser) { + this.parser = parser; + nextBnodeId = 0; + clear(); + } + + String resolveBNode(String value) { + if (value.startsWith(RDF.BNODE_PREFIX) || value.startsWith('[' + RDF.BNODE_PREFIX) + && value.charAt(value.length() - 1) == ']') { + String name; + if (value.charAt(0) == '[') { + name = value.substring(RDF.BNODE_PREFIX.length() + 1, value.length() - 1); + } else { + name = value.substring(RDF.BNODE_PREFIX.length()); + } + if (!bnodeMapping.containsKey(name)) { + bnodeMapping.put(name, createBnode(false)); + } + return bnodeMapping.get(name); + } + return null; + } + + void detectFormat(String localName, String qName, String version) { + if (documentFormat == FORMAT_UNKNOWN) { + 
if (localName.equals(SVG_ROOT_ELEMENT)) { + documentFormat = FORMAT_SVG; + } else if (localName.equalsIgnoreCase(HTML_ROOT_ELEMENT)) { + documentFormat = FORMAT_HTML4; + } else { + documentFormat = FORMAT_XML; + } + } + if (qName.equalsIgnoreCase(HTML_ROOT_ELEMENT) && version != null + && version.toLowerCase().contains(RDFA_10_STRING)) { + rdfaVersion = RDFa.VERSION_10; + } + } + + void detectBase(String qName, String xmlBase, String hRef) { + boolean xmlBaseF = (documentFormat == FORMAT_XML || documentFormat == FORMAT_SVG) && xmlBase != null; + if (xmlBaseF || qName.equalsIgnoreCase(HTML_BASE) && hRef != null) { + base = (xmlBaseF ? xmlBase : hRef).replaceAll("#.*", ""); + } + } + + String createBnode(boolean shortenable) { + if (shortenable) { + return RDF.BNODE_PREFIX + 'n' + (nextBnodeId++) + RDF.SHORTENABLE_BNODE_SUFFIX; + } + return RDF.BNODE_PREFIX + 'n' + nextBnodeId++; + } + + void processDtd(String name, String publicId, String systemId) { + if (publicId == null) { + if (HTML_ROOT_ELEMENT.equalsIgnoreCase(name)) { + documentFormat = FORMAT_HTML5; + } + } else { + String publicIdLower = publicId.toLowerCase(); + if (publicIdLower.contains(HTML_ROOT_ELEMENT)) { + documentFormat = FORMAT_HTML4; + } + if (publicIdLower.contains(RDFA_10_STRING)) { + rdfaVersion = RDFa.VERSION_10; + } + } + } + + String resolveIri(String iri) throws MalformedIriException { + return RIUtils.resolveIri(base, iri); + } + + void clear() { + rdfaVersion = parser.getRdfaVersion(); + documentFormat = FORMAT_UNKNOWN; + bnodeMapping = new HashMap(); + base = null; + originUri = null; + } + + io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary loadVocabulary(String vocabUrl) { + return parser.loadVocabulary(vocabUrl); + } + + void setBaseUri(String baseUri) { + if (base == null) { + originUri = baseUri; + } + this.base = baseUri; + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/EvalContext.java 
b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/EvalContext.java new file mode 100644 index 00000000..4a1e7b22 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/EvalContext.java @@ -0,0 +1,354 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf.rdfa; + +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.DocumentContext; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.RdfaParser; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary; +import io.github.sparqlanything.html.org.semarglproject.ri.RIUtils; +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedCurieException; +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedIriException; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDFa; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Pattern; + +final class EvalContext { + + // Initial context described in http://www.w3.org/2011/rdfa-context/rdfa-1.1.html + private static final Map RDFA11_INITIAL_CONTEXT = new HashMap(); + private static final Pattern TERM_PATTERN 
= Pattern.compile("[a-zA-Z0-9_-]+", Pattern.DOTALL); + + private static final String CAN_NOT_RESOLVE_TERM = "Can't resolve term "; + + private static final String XHTML_VOCAB = "http://www.w3.org/1999/xhtml/vocab#"; + private static final String POWDER_DESCRIBED_BY = "http://www.w3.org/2007/05/powder-s#describedby"; + + private static final String[] XHTML_VOCAB_PROPS = { + // XHTML Metainformation Vocabulary + "alternate", "appendix", "bookmark", "cite", "chapter", "contents", + "copyright", "first", "glossary", "help", "icon", "index", "itsRules", + "last", "license", "meta", "next", "p3pv1", "prev", "previous", "role", + "section", "stylesheet", "subsection", "start","top", "up", + + // Items from the XHTML Role Module + "banner", "complementary", "contentinfo", "definition", "main", + "navigation", "note", "search", + + // Items from the Accessible Rich Internet Applications Vocabulary + "alert", "alertdialog", "application", "article", "button", "checkbox", + "columnheader", "combobox", "dialog", "directory", "document", "form", + "grid", "gridcell", "group", "heading", "img", "link", "list", "listbox", + "listitem", "log", "marquee", "math", "menu", "menubar", "menuitem", + "menuitemcheckbox", "menuitemradio", "option", "presentation", + "progressbar", "radio", "radiogroup", "region", "row", "rowgroup", + "rowheader", "scrollbar", "separator", "slider", "spinbutton", "status", + "tab", "tablist", "tabpanel", "textbox", "timer", "toolbar", "tooltip", + "tree", "treegrid", "treeitem" + }; + + static { + // Vocabulary Prefixes of W3C Documents + RDFA11_INITIAL_CONTEXT.put("owl", "http://www.w3.org/2002/07/owl#"); + RDFA11_INITIAL_CONTEXT.put("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); + RDFA11_INITIAL_CONTEXT.put("rdfs", "http://www.w3.org/2000/01/rdf-schema#"); + RDFA11_INITIAL_CONTEXT.put("rdfa", "http://www.w3.org/ns/rdfa#"); + RDFA11_INITIAL_CONTEXT.put("xhv", "http://www.w3.org/1999/xhtml/vocab#"); + RDFA11_INITIAL_CONTEXT.put("xsd", 
"http://www.w3.org/2001/XMLSchema#"); + RDFA11_INITIAL_CONTEXT.put("grddl", "http://www.w3.org/2003/g/data-view#"); + RDFA11_INITIAL_CONTEXT.put("ma", "http://www.w3.org/ns/ma-ont#"); + RDFA11_INITIAL_CONTEXT.put("rif", "http://www.w3.org/2007/rif#"); + RDFA11_INITIAL_CONTEXT.put("skos", "http://www.w3.org/2004/02/skos/core#"); + RDFA11_INITIAL_CONTEXT.put("skosxl", "http://www.w3.org/2008/05/skos-xl#"); + RDFA11_INITIAL_CONTEXT.put("wdr", "http://www.w3.org/2007/05/powder#"); + RDFA11_INITIAL_CONTEXT.put("void", "http://rdfs.org/ns/void#"); + RDFA11_INITIAL_CONTEXT.put("wdrs", "http://www.w3.org/2007/05/powder-s#"); + RDFA11_INITIAL_CONTEXT.put("xml", "http://www.w3.org/XML/1998/namespace"); + + // Widely used Vocabulary prefixes + RDFA11_INITIAL_CONTEXT.put("cc", "http://creativecommons.org/ns#"); + RDFA11_INITIAL_CONTEXT.put("ctag", "http://commontag.org/ns#"); + RDFA11_INITIAL_CONTEXT.put("dc", "http://purl.org/dc/terms/"); + RDFA11_INITIAL_CONTEXT.put("dcterms", "http://purl.org/dc/terms/"); + RDFA11_INITIAL_CONTEXT.put("foaf", "http://xmlns.com/foaf/0.1/"); + RDFA11_INITIAL_CONTEXT.put("gr", "http://purl.org/goodrelations/v1#"); + RDFA11_INITIAL_CONTEXT.put("ical", "http://www.w3.org/2002/12/cal/icaltzd#"); + RDFA11_INITIAL_CONTEXT.put("og", "http://ogp.me/ns#"); + RDFA11_INITIAL_CONTEXT.put("rev", "http://purl.org/stuff/rev#"); + RDFA11_INITIAL_CONTEXT.put("sioc", "http://rdfs.org/sioc/ns#"); + RDFA11_INITIAL_CONTEXT.put("v", "http://rdf.data-vocabulary.org/#"); + RDFA11_INITIAL_CONTEXT.put("vcard", "http://www.w3.org/2006/vcard/ns#"); + RDFA11_INITIAL_CONTEXT.put("schema", "http://schema.org/"); + } + + Map iriMappings; + String subject; + String object; + List incomplTriples; + String lang; + String objectLit; + String objectLitDt; + List properties; + boolean parsingLiteral; + Map> listMapping; + + private final DocumentContext documentContext; + private io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary vocab; + private String profile; 
+ + private EvalContext(String lang, io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary vocab, String profile, DocumentContext documentContext) { + this.subject = null; + this.object = null; + this.iriMappings = null; + this.incomplTriples = new ArrayList(); + this.lang = lang; + this.objectLit = null; + this.objectLitDt = null; + this.vocab = vocab; + this.profile = profile; + this.properties = null; + this.parsingLiteral = false; + this.listMapping = null; + this.documentContext = documentContext; + } + + static EvalContext createInitialContext(DocumentContext documentContext) { + // RDFa Core 1.0 processing sequence step 1 + EvalContext initialContext = new EvalContext(null, null, null, documentContext); + initialContext.subject = documentContext.base; + initialContext.listMapping = new HashMap>(); + initialContext.iriMappings = new TreeMap(); + return initialContext; + } + + EvalContext initChildContext(String profile, String vocab, String lang, + Map overwriteMappings) { + // RDFa Core 1.0 processing sequence step 2 + EvalContext current = new EvalContext(this.lang, this.vocab, this.profile, documentContext); + current.listMapping = this.listMapping; + current.initPrefixMappings(iriMappings, overwriteMappings); + + if (documentContext.rdfaVersion > RDFa.VERSION_10) { + if (profile != null) { + String newProfile = profile + "#"; + if (current.profile == null) { + current.profile = newProfile; + } else { + current.profile = newProfile + ' ' + current.profile; + } + } + if (vocab != null) { + if (vocab.length() == 0) { + current.vocab = null; + } else { + current.vocab = documentContext.loadVocabulary(vocab); + } + } + } + + // RDFa Core 1.0 processing sequence step 3 + if (lang != null) { + current.lang = lang; + } + if (current.lang != null && current.lang.isEmpty()) { + current.lang = null; + } + return current; + } + + private void initPrefixMappings(Map parentMappings, Map overwriteMappings) { + if (overwriteMappings.isEmpty()) { + 
iriMappings = parentMappings; + } else { + iriMappings = new TreeMap(parentMappings); + iriMappings.putAll(overwriteMappings); + } + + if (documentContext.rdfaVersion > RDFa.VERSION_10) { + for (String prefix : overwriteMappings.keySet()) { + String standardMapping = RDFA11_INITIAL_CONTEXT.get(prefix); + String newMapping = overwriteMappings.get(prefix); + if (standardMapping != null && !standardMapping.equals(newMapping)) { + documentContext.parser.warning(RDFa.PREFIX_REDEFINITION, "Standard prefix " + + prefix + ": redefined to <" + newMapping + '>'); + } + } + } + } + + List getMappingForIri(String iri) { + if (!listMapping.containsKey(iri)) { + listMapping.put(iri, new ArrayList()); + } + return listMapping.get(iri); + } + + void addContent(String content) { + objectLit += content; + } + + void updateBase(String oldBase, String base) { + if (object != null && object.equals(oldBase)) { + object = base; + } + if (subject != null && subject.equals(oldBase)) { + subject = base; + } + } + + /** + * Resolves @predicate or @datatype according to RDFa Core 1.1 section 5 + * + * @param value value of attribute + * @return resource IRI + * @throws MalformedIriException if IRI can not be resolved + */ + String resolvePredOrDatatype(String value) throws MalformedIriException { + if (value == null || value.isEmpty()) { + throw new MalformedIriException("Empty predicate or datatype found"); + } + if (value == io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.RdfaParser.AUTODETECT_DATE_DATATYPE) { + return RdfaParser.AUTODETECT_DATE_DATATYPE; + } + return resolveTermOrCurieOrAbsIri(value); + } + + /** + * Resolves @about or @resource according to RDFa Core 1.1 section 5 + * + * @param value value of attribute + * @return resource IRI + * @throws MalformedIriException if IRI can not be resolved + */ + String resolveAboutOrResource(String value) throws MalformedIriException { + String result = documentContext.resolveBNode(value); + if (result != null) { + return result; 
+ } + return resolveCurieOrIri(value, false); + } + + /** + * Resolves @role according to Role Attribute 1.0 section 4 + * @param value value of attribute + * @return role IRI + * @throws MalformedIriException if role can not be resolved + */ + String resolveRole(String value) throws MalformedIriException { + if (TERM_PATTERN.matcher(value).matches()) { + return XHTML_VOCAB + value; + } + return resolveCurieOrIri(value, true); + } + + /** + * Resolves TERMorCURIEorAbsIRI according to RDFa Core 1.1 section A + * @param value value to be resolved + * @return resource IRI + * @throws MalformedIriException if IRI can not be resolved + */ + private String resolveTermOrCurieOrAbsIri(String value) throws MalformedIriException { + if (TERM_PATTERN.matcher(value).matches()) { + if (vocab == null && documentContext.rdfaVersion > RDFa.VERSION_10 && "describedby".equals(value)) { + return POWDER_DESCRIBED_BY; + } + String term; + if (vocab != null) { + term = vocab.resolveTerm(value); + } else { + term = resolveXhtmlTerm(value); + } + if (term == null) { + documentContext.parser.warning(RDFa.UNRESOLVED_TERM, CAN_NOT_RESOLVE_TERM + value); + throw new MalformedIriException(CAN_NOT_RESOLVE_TERM + value); + } + return term; + } + return resolveCurieOrIri(value, true); + } + + Iterable expand(String pred) { + if (vocab == null) { + return Collections.EMPTY_LIST; + } + return vocab.expand(pred); + } + + private String resolveCurieOrIri(String curie, boolean ignoreRelIri) throws MalformedIriException { + if (!ignoreRelIri && (curie == null || curie.isEmpty())) { + return documentContext.resolveIri(curie); + } + boolean safeSyntax = curie.startsWith("[") && curie.endsWith("]"); + if (safeSyntax) { + curie = curie.substring(1, curie.length() - 1); + } + + int delimPos = curie.indexOf(':'); + if (delimPos == -1) { + if (safeSyntax || ignoreRelIri) { + throw new MalformedCurieException("CURIE with no prefix (" + curie + ") found"); + } + return documentContext.resolveIri(curie); + } + + 
String result = resolveMapping(curie, delimPos, safeSyntax); + if (RIUtils.isIri(result)) { + return result; + } + throw new MalformedIriException("Malformed IRI: " + curie); + } + + private String resolveMapping(String curie, int delimPos, boolean safeSyntax) throws MalformedCurieException { + String localName = curie.substring(delimPos + 1); + String prefix = curie.substring(0, delimPos); + + if (prefix.equals("_")) { + throw new MalformedCurieException("CURIE with invalid prefix (" + curie + ") found"); + } + + if (!iriMappings.containsKey(prefix)) { + if (documentContext.rdfaVersion > RDFa.VERSION_10 && RDFA11_INITIAL_CONTEXT.containsKey(prefix)) { + String nsUri = RDFA11_INITIAL_CONTEXT.get(prefix); + iriMappings.put(prefix, nsUri); + String result = nsUri + localName; + if (RIUtils.isIri(result)) { + return result; + } + throw new MalformedCurieException("Malformed CURIE (" + curie + ")"); + } + if (!safeSyntax && RIUtils.isIri(curie)) { + return curie; + } + throw new MalformedCurieException("CURIE with unresolvable prefix found (" + curie + ")"); + } + return iriMappings.get(prefix) + localName; + } + + private static String resolveXhtmlTerm(String predicate) { + for (String link : XHTML_VOCAB_PROPS) { + if (link.equalsIgnoreCase(predicate)) { + return XHTML_VOCAB + link; + } + } + return null; + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/RdfaParser.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/RdfaParser.java new file mode 100644 index 00000000..0c33c95d --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/RdfaParser.java @@ -0,0 +1,1359 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf.rdfa; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler; +import io.github.sparqlanything.html.org.semarglproject.rdf.RdfXmlParser; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.DocumentContext; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.EvalContext; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.VocabManager; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary; +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedCurieException; +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedIriException; +import io.github.sparqlanything.html.org.semarglproject.ri.RIUtils; +import io.github.sparqlanything.html.org.semarglproject.sink.Pipe; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.sink.XmlSink; +import io.github.sparqlanything.html.org.semarglproject.source.StreamProcessor; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDF; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDFa; +import io.github.sparqlanything.html.org.semarglproject.vocab.XSD; +import io.github.sparqlanything.html.org.semarglproject.xml.XmlUtils; +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +import 
javax.xml.bind.DatatypeConverter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Deque; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Implementation of streaming RDFa (1.0 and + * 1.1) parser. Supports HTML4, HTML5, XHTML1, + * XHTML5, XML and SVG inputs. Provides RDFa version and document syntax autodetection. + * + *
+ * List of supported options: + *
    + *
  • {@link #RDFA_VERSION_PROPERTY}
  • + *
  • {@link StreamProcessor#PROCESSOR_GRAPH_HANDLER_PROPERTY}
  • + *
  • {@link #ENABLE_OUTPUT_GRAPH}
  • + *
  • {@link #ENABLE_PROCESSOR_GRAPH}
  • + *
  • {@link #ENABLE_VOCAB_EXPANSION}
  • + *
+ */ +public final class RdfaParser extends Pipe implements XmlSink, TripleSink, ProcessorGraphHandler { + + /** + * Used as a key with {@link #setProperty(String, Object)} method. + * RDFa version compatibility. Allowed values are {@link RDFa#VERSION_10} and {@link RDFa#VERSION_11}. + */ + public static final String RDFA_VERSION_PROPERTY = + "http://semarglproject.org/rdfa/properties/version"; + + /** + * Used as a key with {@link #setProperty(String, Object)} method. + * Enables or disables generation of triples from output graph. + */ + public static final String ENABLE_OUTPUT_GRAPH = + "http://semarglproject.org/rdfa/properties/enable-output-graph"; + + /** + * Used as a key with {@link #setProperty(String, Object)} method. + * Enables or disables generation of triples from processor graph. + * ProcessorGraphHandler will receive events regardless of this option. + */ + public static final String ENABLE_PROCESSOR_GRAPH = + "http://semarglproject.org/rdfa/properties/enable-processor-graph"; + + /** + * Used as a key with {@link #setProperty(String, Object)} method. + * Enables or disables vocabulary + * expansion feature. 
+ */ + public static final String ENABLE_VOCAB_EXPANSION = + "http://semarglproject.org/rdfa/properties/enable-vocab-expansion"; + + static final String AUTODETECT_DATE_DATATYPE = "AUTODETECT_DATE_DATATYPE"; + + private static final ThreadLocal VOCAB_MANAGER = new ThreadLocal() { + @Override + protected VocabManager initialValue() { + return new VocabManager(); + } + }; + + // flag used in incomplTriple list to indicate that following element should be + // treated as having @rev relation instead of @rel + private static final String REVERSED_TRIPLE_FLAG = null; + // flag used in listMapping list to indicate that following two elements represent literal object + // that allows to save some GC time and avoid creating literal objects hierarchy with following instanceof checks + private static final String LITERAL_OBJECT_FLAG = null; + + private static final String BODY = "body"; + private static final String HEAD = "head"; + private static final String VERSION = "version"; + private static final String METADATA = "metadata"; + + private static final String PLAIN_LITERAL = ""; + private static final String XHTML_DEFAULT_XMLNS = "http://www.w3.org/1999/xhtml"; + + private static final String XHTML_VOCAB = "http://www.w3.org/1999/xhtml/vocab#"; + + // html5 support + private static final String DATETIME_ATTR = "datetime"; + private static final String TIME_QNAME = "time"; + private static final String VALUE_ATTR = "value"; + private static final String DATA_ATTR = "data"; + private static final String XML_BASE = "xml:base"; + + // keys for coalesce method + private static final String BASE_IF_HEAD_OR_BODY = "bihob"; + private static final String BASE_IF_ROOT_NODE = "birn"; + private static final String PARENT_OBJECT = "poie"; + private static final String BNODE_IF_TYPEOF = RDFa.TYPEOF_ATTR; + + private Deque contextStack = null; + + private StringBuilder xmlString = null; + private List xmlStringPred = null; + private String xmlStringSubj = null; + + private Short 
forcedRdfaVersion = null; + private boolean sinkOutputGraph; + private boolean sinkProcessorGraph; + + private boolean expandVocab; + private final DocumentContext dh; + private final Splitter splitter; + private Locator locator = null; + + private ProcessorGraphHandler processorGraphHandler = null; + + private boolean rdfXmlInline = false; + private XmlSink rdfXmlParser = null; + + private Map> patternProps = new HashMap>(); + private List copyingPairs = new ArrayList(); + + private final Map overwriteMappings = new HashMap(); + + private RdfaParser(TripleSink sink) { + super(sink); + contextStack = new LinkedList(); + dh = new DocumentContext(this); + splitter = new Splitter(); + sinkProcessorGraph = true; + sinkOutputGraph = true; + expandVocab = false; + } + + /** + * Creates instance of RdfaParser connected to specified sink + * @param sink sink to be connected to + * @return instance of RdfaParser + */ + public static XmlSink connect(TripleSink sink) { + return new RdfaParser(sink); + } + + @Override + public void startDocument() { + EvalContext initialContext = EvalContext.createInitialContext(dh); + initialContext.iriMappings.put("", XHTML_VOCAB); + contextStack.push(initialContext); + + xmlString = null; + xmlStringPred = null; + xmlStringSubj = null; + + rdfXmlInline = false; + rdfXmlParser = null; + } + + @Override + public void endDocument() throws SAXException { + if (sinkOutputGraph) { + Iterator iterator = copyingPairs.iterator(); + while (iterator.hasNext()) { + String subj = iterator.next(); + String pattern = iterator.next(); + if (patternProps.containsKey(pattern)) { + copyProps(subj, patternProps.get(pattern)); + } + } + + iterator = copyingPairs.iterator(); + while (iterator.hasNext()) { + iterator.next(); + String pattern = iterator.next(); + patternProps.remove(pattern); + } + for (String pattern : patternProps.keySet()) { + addNonLiteralInternal(pattern, RDF.TYPE, RDFa.PATTERN); + copyProps(pattern, patternProps.get(pattern)); + } + } + + 
dh.clear(); + contextStack.clear(); + patternProps.clear(); + copyingPairs.clear(); + } + + @Override + public void startElement(String nsUri, String localName, String qName, Attributes attrs) throws SAXException { + if (rdfXmlInline) { + rdfXmlParser.startElement(nsUri, localName, qName, attrs); + return; + } else if (dh.documentFormat == DocumentContext.FORMAT_SVG && localName.equals(METADATA)) { + if (rdfXmlParser == null) { + rdfXmlParser = RdfXmlParser.connect(this); + rdfXmlParser.setBaseUri(dh.base); + rdfXmlParser.startDocument(); + } + rdfXmlInline = true; + return; + } + + if (contextStack.size() < 4) { + String oldBase = dh.base; + dh.detectFormat(localName, qName, attrs.getValue(VERSION)); + dh.detectBase(qName, attrs.getValue(XML_BASE), attrs.getValue(RDFa.HREF_ATTR)); + if (!dh.base.equals(oldBase)) { + for (EvalContext ctx : contextStack) { + ctx.updateBase(oldBase, dh.base); + } + } + } + + EvalContext parent = contextStack.peek(); + if (parent.parsingLiteral) { + xmlString.append(XmlUtils.serializeOpenTag(nsUri, qName, parent.iriMappings, attrs, false)); + } + + if (dh.rdfaVersion > RDFa.VERSION_10 && attrs.getValue(RDFa.PREFIX_ATTR) != null) { + for (Iterator iterator = splitter.split(attrs.getValue(RDFa.PREFIX_ATTR)); iterator.hasNext(); ) { + String prefix = iterator.next(); + int prefixLength = prefix.length(); + if (prefixLength < 2 || prefix.charAt(prefixLength - 1) != ':' || !iterator.hasNext()) { + continue; + } + String uri = iterator.next(); + startPrefixMapping(prefix.substring(0, prefixLength - 1), uri); + } + } + + String lang = attrs.getValue(XmlUtils.XML_LANG); + if (lang == null) { + lang = attrs.getValue(XmlUtils.LANG); + } + EvalContext current = parent.initChildContext(attrs.getValue(RDFa.PROFILE_ATTR), + attrs.getValue(RDFa.VOCAB_ATTR), lang, overwriteMappings); + overwriteMappings.clear(); + + boolean skipTerms = dh.rdfaVersion > RDFa.VERSION_10 && attrs.getValue(RDFa.PROPERTY_ATTR) != null + && (dh.documentFormat == 
DocumentContext.FORMAT_HTML4 + || dh.documentFormat == DocumentContext.FORMAT_HTML5); + List rels = convertRelRevToList(attrs.getValue(RDFa.REL_ATTR), skipTerms); + List revs = convertRelRevToList(attrs.getValue(RDFa.REV_ATTR), skipTerms); + boolean noRelsAndRevs = rels == null && revs == null; + + boolean skipElement = findSubjectAndObject(qName, attrs, noRelsAndRevs, current, parent); + + // don't fill parent list if subject was changed at this + // or previous step by current.parentObject + if (dh.rdfaVersion > RDFa.VERSION_10 && current.subject != null && (!current.subject.equals(parent.object) + || parent.subject != null && !parent.subject.equals(parent.object))) { + // RDFa Core 1.1 processing sequence step 8 + current.listMapping = new HashMap>(); + } + + processRels(attrs, rels, current); + processRevs(revs, current); + + if (current.object == null && !noRelsAndRevs) { + current.object = dh.createBnode(false); + } + + processPropertyAttr(qName, attrs, current, parent, noRelsAndRevs); + + if (dh.rdfaVersion > RDFa.VERSION_10) { + processRoleAttribute(attrs.getValue(RDFa.ID_ATTR), attrs.getValue(RDFa.ROLE_ATTR), current); + } + + if (!skipElement) { + // RDFa Core 1.0 processing sequence step 10 + // RDFa Core 1.1 processing sequence step 12 + processIncompleteTriples(current, parent); + } + + // RDFa Core 1.0 processing sequence step 11 + // RDFa Core 1.1 processing sequence step 13 + pushContext(current, parent, skipElement); + } + + /** + * Splits @rel or @rev attribute value to list of predicates. Terms can be optionally ignored. 
+ * @param propertyVal value of @rel or @rev attribute + * @param skipTerms is terms should be skipped + * @return list of predicates + */ + private List convertRelRevToList(String propertyVal, boolean skipTerms) { + if (propertyVal == null) { + return null; + } + List result = new ArrayList(); + Iterator iterator = splitter.split(propertyVal); + while (splitter.hasNext()) { + String pred = iterator.next(); + if (skipTerms && pred.indexOf(':') == -1) { + continue; + } + result.add(pred); + } + if (skipTerms && result.isEmpty()) { + result = null; + } + return result; + } + + /** + * Generates triples related to @role attribute + * @param id value of @id attribute + * @param roleVal value of @role attribute + * @param current current context + */ + private void processRoleAttribute(String id, String roleVal, EvalContext current) { + if (roleVal == null) { + return; + } + String subject; + if (id != null) { + subject = dh.base + '#' + id; + } else { + subject = dh.createBnode(true); + } + Iterator iterator = splitter.split(roleVal); + while (splitter.hasNext()) { + try { + String role = current.resolveRole(iterator.next()); + addNonLiteral(subject, XHTML_VOCAB + "role", role); + } catch (MalformedIriException e) { + // do nothing + } + } + } + + /** + * Determines object and subject for current context + * @param qName node's qName + * @param attrs node's attributes + * @param noRelAndRev is no @rel and @rev attributes specified + * @param current current context + * @param parent parent context + * @return skip element flag + */ + private boolean findSubjectAndObject(String qName, Attributes attrs, boolean noRelAndRev, EvalContext current, + EvalContext parent) { + String newSubject = null; + try { + if (dh.rdfaVersion > RDFa.VERSION_10) { + if (noRelAndRev) { + // RDFa Core 1.1 processing sequence step 5 + if (attrs.getValue(RDFa.PROPERTY_ATTR) != null && attrs.getValue(RDFa.CONTENT_ATTR) == null + && attrs.getValue(VALUE_ATTR) == null && 
attrs.getValue(RDFa.DATATYPE_ATTR) == null) { + // RDFa Core 1.1 processing sequence step 5.1 + current.subject = coalesce(qName, attrs, parent, current, RDFa.ABOUT_ATTR, + BASE_IF_ROOT_NODE, PARENT_OBJECT); + + if (attrs.getValue(RDFa.TYPEOF_ATTR) != null) { + current.object = coalesce(qName, attrs, parent, current, RDFa.ABOUT_ATTR, BASE_IF_ROOT_NODE, + RDFa.RESOURCE_ATTR, DATA_ATTR, RDFa.HREF_ATTR, RDFa.SRC_ATTR, BNODE_IF_TYPEOF); + newSubject = current.object; + } + } else { + // RDFa Core 1.1 processing sequence step 5.2 + current.subject = coalesce(qName, attrs, parent, current, RDFa.ABOUT_ATTR, + RDFa.RESOURCE_ATTR, DATA_ATTR, RDFa.HREF_ATTR, RDFa.SRC_ATTR, BASE_IF_ROOT_NODE, + BNODE_IF_TYPEOF, PARENT_OBJECT); + if (attrs.getValue(RDFa.TYPEOF_ATTR) != null) { + newSubject = current.subject; + } + } + } else { + // RDFa Core 1.1 processing sequence step 6 + current.object = coalesce(qName, attrs, parent, current, RDFa.RESOURCE_ATTR, DATA_ATTR, + RDFa.HREF_ATTR, RDFa.SRC_ATTR); + current.subject = coalesce(qName, attrs, parent, current, RDFa.ABOUT_ATTR, + BASE_IF_ROOT_NODE, PARENT_OBJECT); + if (attrs.getValue(RDFa.TYPEOF_ATTR) != null) { + if (attrs.getValue(RDFa.ABOUT_ATTR) != null) { + newSubject = current.subject; + } else { + if (current.object == null) { + current.object = dh.createBnode(noRelAndRev); + } + newSubject = current.object; + } + } + } + } else { + if (noRelAndRev) { + // RDFa Core 1.0 processing sequence step 4 + current.subject = coalesce(qName, attrs, parent, current, RDFa.ABOUT_ATTR, RDFa.SRC_ATTR, + RDFa.RESOURCE_ATTR, RDFa.HREF_ATTR, BASE_IF_HEAD_OR_BODY, BNODE_IF_TYPEOF, PARENT_OBJECT); + } else { + // RDFa Core 1.0 processing sequence step 5 + current.subject = coalesce(qName, attrs, parent, current, RDFa.ABOUT_ATTR, RDFa.SRC_ATTR, + BASE_IF_HEAD_OR_BODY, BNODE_IF_TYPEOF, PARENT_OBJECT); + current.object = coalesce(qName, attrs, parent, current, RDFa.RESOURCE_ATTR, RDFa.HREF_ATTR); + } + if (attrs.getValue(RDFa.TYPEOF_ATTR) != null) { 
+ newSubject = current.subject; + } + } + } catch (MalformedIriException e) { + warning(RDFa.WARNING, e.getMessage()); + pushContextNoLiteral(current, parent); + } + + if (newSubject != null) { + // RDFa Core 1.0 processing sequence step 6 + // RDFa Core 1.1 processing sequence step 7 + Iterator iterator = splitter.split(attrs.getValue(RDFa.TYPEOF_ATTR)); + while (splitter.hasNext()) { + try { + String iri = current.resolvePredOrDatatype(iterator.next()); + addNonLiteral(newSubject, RDF.TYPE, iri); + } catch (MalformedIriException e) { + // do nothing + } + } + } + return noRelAndRev && attrs.getValue(RDFa.PROPERTY_ATTR) == null + && (current.subject == null && parent.object == null || current.subject.equals(parent.object)); + } + + /** + * Iterates through attribute names list and returns first not null + * value of attribute with such name. Also processes special cases + * if no such attributes found: + *
    + *
  • {@link #BNODE_IF_TYPEOF} - returns new bnode if typeof attr found
  • + *
  • {@link #PARENT_OBJECT} - returns parent.object
  • + *
  • {@link #BASE_IF_HEAD_OR_BODY} - returns base if processing head or body node in HTML
  • + *
+ * + * @param tagName name of processed element + * @param attrs attribute list + * @param parent parent context + * @param current current context + * @param attrNames prioritized list of attributes + * @throws MalformedIriException + */ + private String coalesce(String tagName, Attributes attrs, EvalContext parent, + EvalContext current, String... attrNames) throws MalformedIriException { + for (String attr : attrNames) { + if (attrs.getValue(attr) != null) { + if (attr.equals(RDFa.ABOUT_ATTR) || attr.equals(RDFa.RESOURCE_ATTR)) { + String val = attrs.getValue(attr); + if (val.equals("[]")) { + continue; + } + try { + return current.resolveAboutOrResource(val); + } catch (MalformedCurieException e) { + warning(RDFa.UNRESOLVED_CURIE, e.getMessage()); + return null; + } + } else if (attr.equals(RDFa.HREF_ATTR) || attr.equals(RDFa.SRC_ATTR) || attr.equals(DATA_ATTR)) { + return dh.resolveIri(attrs.getValue(attr)); + } else if (attr.equals(BNODE_IF_TYPEOF)) { + return dh.createBnode(false); + } + } else if (attr.equals(PARENT_OBJECT) && parent.object != null) { + return parent.object; + } else { + boolean isHeadOrBody = tagName.equals(HEAD) || tagName.equals(BODY); + boolean isRoot = contextStack.size() == 1 || attrs.getValue(RDFa.TYPEOF_ATTR) != null && isHeadOrBody; + if (isHeadOrBody && attr.equals(BASE_IF_HEAD_OR_BODY) || isRoot && attr.equals(BASE_IF_ROOT_NODE)) { + return dh.base; + } + } + } + return null; + } + + /** + * Generates [incompleted] triples with predicates from @rel attribute + * @param attrs node's attributes + * @param rels list of predicates from @rel attribute + * @param current current context + */ + private void processRels(Attributes attrs, List rels, EvalContext current) { + if (rels != null) { + boolean inList = dh.rdfaVersion > RDFa.VERSION_10 && attrs.getValue(RDFa.INLIST_ATTR) != null; + // RDFa Core 1.1 processing sequence steps 9 and 10 + // RDFa Core 1.0 processing sequence steps 7 and 8 + for (String predicate : rels) { + String 
iri; + try { + iri = current.resolvePredOrDatatype(predicate); + } catch (MalformedIriException e) { + continue; + } + if (inList) { + List list = current.getMappingForIri(iri); + if (current.object != null) { + list.add(current.object); + } else { + current.incomplTriples.add(list); + } + } else { + if (current.object != null) { + addNonLiteral(current.subject, iri, current.object); + } else { + current.incomplTriples.add(iri); + } + } + } + } + } + + /** + * Generates [incompleted] triples with predicates from @rev attribute + * @param revs list of predicates from @rev attribute + * @param current current context + */ + private void processRevs(List revs, EvalContext current) { + if (revs != null) { + for (String predicate : revs) { + // RDFa Core 1.1 processing sequence steps 9 and 10 + try { + String iri = current.resolvePredOrDatatype(predicate); + if (current.object != null) { + addNonLiteral(current.object, iri, current.subject); + } else { + current.incomplTriples.add(REVERSED_TRIPLE_FLAG); + current.incomplTriples.add(iri); + } + } catch (MalformedIriException e) { + // do nothing + } + } + } + } + + /** + * Processes @property attribute of specified node + * @param qName node's QName + * @param attrs node's attributes + * @param current current context + * @param parent parent context + * @param noRelsAndRevs are on @rel and @rev attributes specified + */ + private void processPropertyAttr(String qName, Attributes attrs, EvalContext current, + EvalContext parent, boolean noRelsAndRevs) { + if (attrs.getValue(RDFa.PROPERTY_ATTR) == null) { + current.parsingLiteral = false; + return; + } + + // RDFa Core 1.0 processing sequence step 9 + // RDFa Core 1.1 processing sequence step 11 + parseLiteralObject(qName, attrs, current, parent, noRelsAndRevs); + + // noinspection StringEquality + current.parsingLiteral = current.objectLitDt == RDF.XML_LITERAL; + if (current.properties == null) { + current.objectLitDt = null; + current.parsingLiteral = false; + } + } + + 
/** + * Determines literal object for specified node. Can change objectLitDt in current context + * @param qName node's QName + * @param attrs node's attributes + * @param current current context + * @param parent parent context + * @param noRelsAndRevs are on @rel and @rev attributes specified + */ + private void parseLiteralObject(String qName, Attributes attrs, EvalContext current, + EvalContext parent, boolean noRelsAndRevs) { + String content = parseContent(attrs); + String langOrDt = parseDatatype(qName, attrs, current); + + if (langOrDt != null && !RDF.XML_LITERAL.equals(langOrDt)) { + // RDFa Core 1.0 processing sequence step 9, typed literal case + // RDFa Core 1.1 processing sequence step 11, typed literal case + if (content != null) { + langOrDt = resolveLangOrDt(content, langOrDt, current); + } else { + current.objectLitDt = langOrDt; + langOrDt = null; + } + } else if (content != null) { + // RDFa Core 1.0 processing sequence step 9, plain literal case + // RDFa Core 1.1 processing sequence step 11, plain literal using @content case + langOrDt = current.lang; + } else if (langOrDt == null && dh.rdfaVersion > RDFa.VERSION_10) { + if (attrs.getValue(RDFa.CONTENT_ATTR) == null && attrs.getValue(VALUE_ATTR) == null && noRelsAndRevs) { + // RDFa Core 1.1 processing sequence step 11, no rel or rev or content case + try { + langOrDt = coalesce(qName, attrs, parent, current, + RDFa.RESOURCE_ATTR, DATA_ATTR, RDFa.HREF_ATTR, RDFa.SRC_ATTR); + } catch (MalformedIriException e) { + warning(RDFa.WARNING, e.getMessage()); + pushContextNoLiteral(current, parent); + } + } + if (langOrDt == null) { + if (attrs.getValue(RDFa.ABOUT_ATTR) == null && attrs.getValue(RDFa.TYPEOF_ATTR) != null) { + // RDFa Core 1.1 processing sequence step 11, @typeof present and @about is not case + langOrDt = current.object; + if (current.object == null) { + // RDFa Core 1.1 processing sequence step 11, last plain literal case + current.objectLitDt = PLAIN_LITERAL; + } + } else { + // RDFa 
Core 1.1 processing sequence step 11, last plain literal case + current.objectLitDt = PLAIN_LITERAL; + } + } + } else { + if (langOrDt == null || langOrDt.length() > 0) { + // RDFa Core 1.0 processing sequence step 9, xml literal case + // RDFa Core 1.1 processing sequence step 11, xml literal case + current.objectLitDt = RDF.XML_LITERAL; + } else { + // RDFa Core 1.0 processing sequence step 9, plain literal case + // RDFa Core 1.1 processing sequence step 11, plain literal case + current.objectLitDt = PLAIN_LITERAL; + } + langOrDt = null; + } + boolean inList = attrs.getValue(RDFa.INLIST_ATTR) != null; + processPropertyPredicate(attrs, content, langOrDt, current, inList); + } + + /** + * Extracts content for specified node with respect of HTML5 attributes + * @param attrs node's attributes + * @return content + */ + private String parseContent(Attributes attrs) { + String content = attrs.getValue(RDFa.CONTENT_ATTR); + if (content == null && dh.documentFormat == DocumentContext.FORMAT_HTML5) { + if (attrs.getValue(VALUE_ATTR) != null) { + content = attrs.getValue(VALUE_ATTR); + } + if (attrs.getValue(DATETIME_ATTR) != null) { + content = attrs.getValue(DATETIME_ATTR); + } + } + return content; + } + + /** + * Extracts datatype uri for specified node + * @param qName node's QName + * @param attrs node's attributes + * @param current current context + * @return datatype URI or {@link #AUTODETECT_DATE_DATATYPE} if datatype should be detected at validation phase + */ + private String parseDatatype(String qName, Attributes attrs, EvalContext current) { + String datatype = attrs.getValue(RDFa.DATATYPE_ATTR); + if (dh.documentFormat == DocumentContext.FORMAT_HTML5) { + if (attrs.getValue(DATETIME_ATTR) != null) { + if (datatype == null) { + datatype = AUTODETECT_DATE_DATATYPE; + } + } else if (qName.equals(TIME_QNAME) && datatype == null) { + datatype = AUTODETECT_DATE_DATATYPE; + } + } + try { + if (datatype != null && datatype.length() > 0) { + datatype = 
current.resolvePredOrDatatype(datatype); + } + } catch (MalformedIriException e) { + datatype = null; + } + return datatype; + } + + /** + * Generates triples corresponding to specified object and predicates from @property attribute + * @param attrs node's attributes + * @param content objects's content + * @param langOrUri object's content lang or datatype (if literal) or object's URI + * @param current current context + * @param inList is inlist property presented + */ + private void processPropertyPredicate(Attributes attrs, String content, String langOrUri, + EvalContext current, boolean inList) { + Iterator iterator = splitter.split(attrs.getValue(RDFa.PROPERTY_ATTR)); + while (splitter.hasNext()) { + String iri; + try { + iri = current.resolvePredOrDatatype(iterator.next()); + } catch (MalformedIriException e) { + continue; + } + if (content != null || langOrUri != null) { + if (dh.rdfaVersion > RDFa.VERSION_10 && inList) { + List list = current.getMappingForIri(iri); + if (content != null) { + list.add(LITERAL_OBJECT_FLAG); + list.add(content); + list.add(langOrUri); + } else { + list.add(langOrUri); + } + } else { + if (content != null) { + addLiteralTriple(current.subject, iri, content, langOrUri); + } else { + addNonLiteral(current.subject, iri, langOrUri); + } + } + } else if (current.properties == null) { + current.properties = new ArrayList(); + if (dh.rdfaVersion > RDFa.VERSION_10 && inList) { + current.properties.add(RDFa.INLIST_ATTR); + } + current.properties.add(iri); + } else { + current.properties.add(iri); + } + } + } + + private String resolveLangOrDt(String content, String dt, EvalContext current) { + if (dt == null) { + return current.lang; + } + if (dt.equals(RdfaParser.AUTODETECT_DATE_DATATYPE)) { + try { + if (content.matches("-?P\\d+Y\\d+M\\d+DT\\d+H\\d+M\\d+(\\.\\d+)?S")) { + return XSD.DURATION; + } + if (content.indexOf(':') != -1) { + if (content.indexOf('T') != -1) { + DatatypeConverter.parseDateTime(content); + return XSD.DATE_TIME; 
+ } + DatatypeConverter.parseTime(content); + return XSD.TIME; + } + if (content.matches("-?\\d{4,}")) { + return XSD.G_YEAR; + } + if (content.matches("-?\\d{4,}-(0[1-9]|1[0-2])")) { + return XSD.G_YEAR_MONTH; + } + DatatypeConverter.parseDate(content); + return XSD.DATE; + } catch (IllegalArgumentException e) { + return current.lang; + } + } + if (dt.indexOf(':') == -1) { + return current.lang; + } + return dt; + } + + /** + * Generates triples from parent's incompleted triples list + * @param current current context + * @param parent parent context + */ + private void processIncompleteTriples(EvalContext current, EvalContext parent) { + if (current.subject == null) { + return; + } + String subject = parent.subject; + for (Iterator iti = parent.incomplTriples.iterator(); iti.hasNext(); ) { + Object predicateOrList = iti.next(); + if (predicateOrList == REVERSED_TRIPLE_FLAG) { + addNonLiteral(current.subject, (String) iti.next(), subject); + } else if (predicateOrList instanceof String) { + addNonLiteral(subject, (String) predicateOrList, current.subject); + } else { + @SuppressWarnings("unchecked") + Collection list = (Collection) predicateOrList; + list.add(current.subject); + } + } + } + + /** + * Pushes current context to stack before processing child nodes + * @param current current context + * @param parent parent context + */ + private void pushContext(EvalContext current, EvalContext parent, boolean skipElement) { + if (current.parsingLiteral) { + xmlString = new StringBuilder(); + xmlStringPred = current.properties; + xmlStringSubj = current.subject == null ? 
parent.subject : current.subject; + } + if (current.parsingLiteral || skipElement) { + current.subject = parent.subject; + current.object = parent.object; + current.incomplTriples = parent.incomplTriples; + current.objectLit = null; + current.objectLitDt = parent.objectLitDt; + if (current.objectLitDt != null) { + current.objectLit = ""; + } + current.properties = null; + contextStack.push(current); + } else { + pushContextNoLiteral(current, parent); + } + } + + /** + * Pushes current context to stack before processing child nodes when no literals are parsed + * @param current current context + * @param parent parent context + */ + private void pushContextNoLiteral(EvalContext current, EvalContext parent) { + if (current.subject == null) { + current.subject = parent.subject; + } + if (current.object == null) { + current.object = current.subject; + } + if (current.objectLitDt != null || parent.objectLitDt != null) { + current.objectLit = ""; + } + contextStack.push(current); + } + + @Override + public void endElement(String nsUri, String localName, String qName) throws SAXException { + if (rdfXmlInline) { + // delegate parsing to RDF/XML parser + if (dh.documentFormat == DocumentContext.FORMAT_SVG && localName.equals(METADATA)) { + rdfXmlParser.endDocument(); + rdfXmlParser = null; + rdfXmlInline = false; + } else { + rdfXmlParser.endElement(nsUri, localName, qName); + } + return; + } + + EvalContext current = contextStack.pop(); + processXmlString(current); + + // serialize close tag if parsing literal + if (xmlString != null) { + xmlString.append("'); + } + + if (contextStack.isEmpty()) { + return; + } + + EvalContext parent = contextStack.peek(); + processContent(current, parent); + + // noinspection ObjectEquality + if (parent.listMapping != current.listMapping) { + // current mapping isn't inherited from parent + // RDFa Core 1.0 processing sequence step 14 + processListMappings(current); + } + } + + /** + * Generates triples for parsed literal if it present + 
* @param current current context + */ + private void processXmlString(EvalContext current) { + if (current.parsingLiteral && xmlString != null) { + String content = xmlString.toString(); + xmlString = null; + if (dh.rdfaVersion == RDFa.VERSION_10 && content.indexOf('<') == -1) { + for (String pred : xmlStringPred) { + addPlainLiteral(xmlStringSubj, pred, content, current.lang); + } + } else { + for (String pred : xmlStringPred) { + addTypedLiteral(xmlStringSubj, pred, content, RDF.XML_LITERAL); + } + } + } + } + + /** + * Generates triples for node content + * @param current current context + * @param parent parent context + */ + private void processContent(EvalContext current, EvalContext parent) { + String content = current.objectLit; + if (content == null) { + return; + } + if (!parent.parsingLiteral && parent.objectLit != null) { + parent.objectLit += content; + } + if (current.properties == null) { + return; + } + + String dt = current.objectLitDt; + boolean inlist = RDFa.INLIST_ATTR.equals(current.properties.get(0)); + + if (inlist) { + String langOrDt = resolveLangOrDt(content, dt, current); + current.properties.remove(0); + for (String predIri : current.properties) { + List mappingForIri = current.getMappingForIri(predIri); + mappingForIri.add(LITERAL_OBJECT_FLAG); + mappingForIri.add(content); + mappingForIri.add(langOrDt); + } + } else { + for (String predIri : current.properties) { + dt = resolveLangOrDt(content, dt, current); + addLiteralTriple(current.subject, predIri, content, dt); + } + } + } + + /** + * Generates triples from list mappings on node close event + * @param current current context + */ + private void processListMappings(EvalContext current) { + Map> list = current.listMapping; + for (String pred : list.keySet()) { + String prev = null; + String start = null; + for (Iterator iterator = list.get(pred).iterator(); iterator.hasNext(); ) { + String res = iterator.next(); + String child = dh.createBnode(false); + // noinspection 
StringEquality + if (res == LITERAL_OBJECT_FLAG) { + String content = iterator.next(); + String langOrDt = iterator.next(); + addLiteralTriple(child, RDF.FIRST, content, langOrDt); + } else { + addNonLiteral(child, RDF.FIRST, res); + } + if (prev == null) { + start = child; + } else { + addNonLiteral(prev, RDF.REST, child); + } + prev = child; + } + if (start == null) { + addNonLiteral(current.subject, pred, RDF.NIL); + } else { + addNonLiteral(prev, RDF.REST, RDF.NIL); + addNonLiteral(current.subject, pred, start); + } + } + list.clear(); + } + + @Override + public void characters(char[] buffer, int start, int length) throws SAXException { + if (rdfXmlInline) { + rdfXmlParser.characters(buffer, start, length); + return; + } + EvalContext parent = contextStack.peek(); + if (xmlString != null) { + xmlString.append(buffer, start, length); + } + if (parent.objectLit != null) { + parent.addContent(String.copyValueOf(buffer, start, length)); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (rdfXmlInline) { + rdfXmlParser.ignorableWhitespace(ch, start, length); + } + } + + @Override + public void startDTD(String name, String publicId, String systemId) throws SAXException { + dh.processDtd(name, publicId, systemId); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + if (rdfXmlInline) { + rdfXmlParser.startPrefixMapping(prefix, uri); + return; + } + // TODO: check for valid prefix + if (prefix.length() == 0 && XHTML_DEFAULT_XMLNS.equalsIgnoreCase(uri)) { + overwriteMappings.put(prefix, XHTML_VOCAB); + } else { + try { + overwriteMappings.put(prefix, RIUtils.resolveIri(dh.originUri, uri)); + } catch (MalformedIriException e) { + // do nothing + } + } + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + if (rdfXmlInline) { + rdfXmlParser.endPrefixMapping(prefix); + } + } + + @Override + public boolean 
setPropertyInternal(String key, Object value) { + if (ENABLE_OUTPUT_GRAPH.equals(key) && value instanceof Boolean) { + sinkOutputGraph = (Boolean) value; + } else if (getRdfaVersion() != RDFa.VERSION_10 && ENABLE_PROCESSOR_GRAPH.equals(key) + && value instanceof Boolean) { + sinkProcessorGraph = (Boolean) value; + forcedRdfaVersion = RDFa.VERSION_11; + } else if (getRdfaVersion() != RDFa.VERSION_10 && ENABLE_VOCAB_EXPANSION.equals(key) + && value instanceof Boolean) { + expandVocab = (Boolean) value; + forcedRdfaVersion = RDFa.VERSION_11; +// } else if (sinkProcessorGraph || expandVocab) { +// forcedRdfaVersion = RDFa.VERSION_11; + } else if (RDFA_VERSION_PROPERTY.equals(key) && value instanceof Short) { + short rdfaVersion = (Short) value; + if (rdfaVersion < RDFa.VERSION_10 || rdfaVersion > RDFa.VERSION_11) { + throw new IllegalArgumentException("Unsupported RDFa version"); + } + forcedRdfaVersion = rdfaVersion; + dh.rdfaVersion = forcedRdfaVersion; + if (rdfaVersion < RDFa.VERSION_11) { + sinkProcessorGraph = false; + expandVocab = false; + } else { + sinkProcessorGraph = true; + expandVocab = true; + } + } else if (StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY.equals(key) + && value instanceof ProcessorGraphHandler) { + processorGraphHandler = (ProcessorGraphHandler) value; + return false; + } else { + return false; + } + return true; + } + + @Override + public void setBaseUri(String baseUri) { + dh.setBaseUri(baseUri); + } + + /** + * Loads vocabulary from specified URL. Vocabulary will not contain terms in case when + * vocabulary expansion is disabled. 
+ * + * @param vocabUrl URL to load from + * @return loaded vocabulary (can be cached) + */ + Vocabulary loadVocabulary(String vocabUrl) { + if (sinkOutputGraph) { + sink.addNonLiteral(dh.base, RDFa.USES_VOCABULARY, vocabUrl); + } + return VOCAB_MANAGER.get().findVocab(vocabUrl, expandVocab); + } + + // error handling + + @Override + public void info(String infoClass, String message) { + addProcessorGraphRecord(infoClass, message); + if (processorGraphHandler != null) { + processorGraphHandler.info(infoClass, message); + } + } + + @Override + public void warning(String warningClass, String message) { + addProcessorGraphRecord(warningClass, message); + if (processorGraphHandler != null) { + processorGraphHandler.warning(warningClass, message); + } + } + + @Override + public void error(String errorClass, String message) { + addProcessorGraphRecord(errorClass, message); + if (processorGraphHandler != null) { + processorGraphHandler.error(errorClass, message); + } + } + + private void addProcessorGraphRecord(String recordClass, String recordContext) { + if (dh.rdfaVersion > RDFa.VERSION_10 && sinkProcessorGraph) { + String errorNode = dh.createBnode(true); + String location = ""; + if (locator != null) { + location = " at " + locator.getLineNumber() + ':' + locator.getColumnNumber(); + } + sink.addNonLiteral(errorNode, RDF.TYPE, recordClass); + sink.addPlainLiteral(errorNode, RDFa.CONTEXT, recordContext + location, null); + } + } + + @Override + public ParseException processException(SAXException e) { + Throwable cause = e.getCause(); + if (cause instanceof ParseException) { + error(RDFa.ERROR, cause.getMessage()); + return (ParseException) cause; + } + error(RDFa.ERROR, e.getMessage()); + return new ParseException(e); + } + + private void copyProps(String subj, List props) { + Iterator iterator = props.iterator(); + while (iterator.hasNext()) { + String type = iterator.next(); + if (type == null) { + addNonLiteralInternal(subj, iterator.next(), iterator.next()); + } 
else if (type.isEmpty()) { + addPlainLiteralInternal(subj, iterator.next(), iterator.next(), iterator.next()); + } else { + addTypedLiteralInternal(subj, iterator.next(), iterator.next(), type); + } + } + } + + // proxying TripleSink calls to filter output graph + + private void addLiteralTriple(String subject, String pred, String content, String langOrDt) { + if (langOrDt == null || langOrDt.length() < 6 || langOrDt.indexOf(':') == -1) { + addPlainLiteral(subject, pred, content, langOrDt); + } else { + addTypedLiteral(subject, pred, content, langOrDt); + } + } + + @Override + public void addNonLiteral(String subj, String pred, String obj) { + if (!sinkOutputGraph) { + return; + } + if (obj.equals(RDFa.PATTERN)) { + if (!patternProps.containsKey(subj)) { + patternProps.put(subj, new ArrayList()); + } + return; + // TODO: check vocab expansion + } else if (pred.equals(RDFa.COPY)) { + if (patternProps.containsKey(obj)) { + copyProps(subj, patternProps.get(obj)); + } else { + copyingPairs.add(subj); + copyingPairs.add(obj); + } + return; + } else if (patternProps.containsKey(subj)) { + List props = patternProps.get(subj); + props.add(null); + props.add(pred); + props.add(obj); + return; + } + addNonLiteralInternal(subj, pred, obj); + } + + private void addNonLiteralInternal(String subj, String pred, String obj) { + if (!expandVocab) { + sink.addNonLiteral(subj, pred, obj); + return; + } + addNonLiteralWithObjExpansion(subj, pred, obj); + for (String predSynonym : contextStack.peek().expand(pred)) { + addNonLiteralWithObjExpansion(subj, predSynonym, obj); + } + } + + private void addNonLiteralWithObjExpansion(String subj, String pred, String obj) { + if (obj.startsWith(RDF.BNODE_PREFIX)) { + sink.addNonLiteral(subj, pred, obj); + return; + } + sink.addNonLiteral(subj, pred, obj); + for (String objSynonym : contextStack.peek().expand(obj)) { + sink.addNonLiteral(subj, pred, objSynonym); + } + } + + @Override + public void addPlainLiteral(String subj, String pred, String 
content, String lang) { + if (!sinkOutputGraph) { + return; + } + if (patternProps.containsKey(subj)) { + List props = patternProps.get(subj); + props.add(""); + props.add(pred); + props.add(content); + props.add(lang); + return; + } + addPlainLiteralInternal(subj, pred, content, lang); + } + + private void addPlainLiteralInternal(String subj, String pred, String content, String lang) { + sink.addPlainLiteral(subj, pred, content, lang); + for (String predSynonym : contextStack.peek().expand(pred)) { + sink.addPlainLiteral(subj, predSynonym, content, lang); + } + } + + @Override + public void addTypedLiteral(String subj, String pred, String content, String type) { + if (!sinkOutputGraph) { + return; + } + if (patternProps.containsKey(subj)) { + List props = patternProps.get(subj); + props.add(type); + props.add(pred); + props.add(content); + return; + } + addTypedLiteralInternal(subj, pred, content, type); + } + + private void addTypedLiteralInternal(String subj, String pred, String content, String type) { + sink.addTypedLiteral(subj, pred, content, type); + for (String predSynonym : contextStack.peek().expand(pred)) { + sink.addTypedLiteral(subj, predSynonym, content, type); + } + } + + @Override + public void setDocumentLocator(Locator locator) { + this.locator = locator; + } + + // ignored events + + @Override + public void processingInstruction(String target, String data) throws SAXException { + } + + @Override + public void skippedEntity(String name) throws SAXException { + } + + @Override + public void startEntity(String s) throws SAXException { + } + + @Override + public void endEntity(String s) throws SAXException { + } + + @Override + public void startCDATA() throws SAXException { + } + + @Override + public void endCDATA() throws SAXException { + } + + @Override + public void comment(char[] chars, int i, int i1) throws SAXException { + } + + @Override + public void endDTD() throws SAXException { + } + + short getRdfaVersion() { + if (forcedRdfaVersion == 
null) { + return RDFa.VERSION_11; + } + return forcedRdfaVersion; + } + + private static final class Splitter implements Iterator { + private int pos = -1; + private int length = -1; + private String string = null; + + private Iterator split(String string) { + this.string = string; + length = string.length(); + pos = 0; + while (pos < length && XmlUtils.WHITESPACE.get(string.charAt(pos))) { + pos++; + } + return this; + } + + @Override + public boolean hasNext() { + return pos < length; + } + + @Override + public String next() { + int start = pos; + while (pos < length && !XmlUtils.WHITESPACE.get(string.charAt(pos))) { + pos++; + } + if (start == pos) { + throw new NoSuchElementException(); + } + String result = string.substring(start, pos); + while (pos < length && XmlUtils.WHITESPACE.get(string.charAt(pos))) { + pos++; + } + if (pos == length) { + string = null; + } + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/VocabManager.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/VocabManager.java new file mode 100644 index 00000000..6425b865 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/VocabManager.java @@ -0,0 +1,39 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.rdf.rdfa; + +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary; + +import java.util.HashMap; +import java.util.Map; + +final class VocabManager { + + private final Map vocabCache = new HashMap(); + + io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary findVocab(String vocabUrl, boolean expandVocab) { + io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary vocab = vocabCache.get(vocabUrl); + if (vocab != null) { + return vocab; + } + vocab = new io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.Vocabulary(vocabUrl); + vocabCache.put(vocabUrl, vocab); + if (expandVocab) { + vocab.load(); + } + return vocab; + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/Vocabulary.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/Vocabulary.java new file mode 100644 index 00000000..1176cc22 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/rdf/rdfa/Vocabulary.java @@ -0,0 +1,172 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.sparqlanything.html.org.semarglproject.rdf.rdfa; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.rdf.RdfXmlParser; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.RdfaParser; +import io.github.sparqlanything.html.org.semarglproject.ri.RIUtils; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.source.StreamProcessor; +import io.github.sparqlanything.html.org.semarglproject.vocab.OWL; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDF; +import io.github.sparqlanything.html.org.semarglproject.vocab.RDFS; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +final class Vocabulary { + + private final String url; + private Map> expansions = null; + private Collection terms = null; + + Vocabulary(String url) { + this.url = url; + } + + private void addExpansion(String pred, String expansion) { + if (!expansions.containsKey(pred)) { + expansions.put(pred, new HashSet()); + } + expansions.get(pred).add(expansion); + } + + void load() { + VocabParser vocabParser = new VocabParser(); + + URL vocabUrl; + try { + vocabUrl = new URL(url); + } catch (MalformedURLException e) { + return; + } + + if (expansions == null) { + expansions = new HashMap>(); + terms = new HashSet(); + } + + StreamProcessor rdfaSp = new StreamProcessor(RdfaParser.connect(vocabParser)); + rdfaSp.setProperty(RdfaParser.ENABLE_VOCAB_EXPANSION, false); + parseVocabWithDp(vocabUrl, rdfaSp); + + if (!terms.isEmpty() || !expansions.isEmpty()) { + return; + } + + // TODO: add format detection + StreamProcessor rdfXmlSp = new 
StreamProcessor(RdfXmlParser.connect(vocabParser)); + rdfaSp.setProperty(RdfaParser.ENABLE_VOCAB_EXPANSION, false); + parseVocabWithDp(vocabUrl, rdfXmlSp); + + if (terms.isEmpty() && expansions.isEmpty()) { + terms = null; + expansions = null; + } + } + + private void parseVocabWithDp(URL vocabUrl, StreamProcessor streamProcessor) { + InputStream inputStream; + try { + inputStream = vocabUrl.openStream(); + } catch (IOException e) { + return; + } + InputStreamReader reader = new InputStreamReader(inputStream); + try { + streamProcessor.process(reader, url); + } catch (ParseException e) { + // do nothing + } finally { + try { + reader.close(); + } catch (IOException e) { + // do nothing + } + } + } + + Collection expand(String uri) { + if (expansions == null || !expansions.containsKey(uri)) { + return Collections.EMPTY_LIST; + } + return expansions.get(uri); + } + + String resolveTerm(String term) { + String termUri = url + term; + if (terms == null && RIUtils.isAbsoluteIri(termUri) || terms != null && terms.contains(termUri)) { + return termUri; + } + return null; + } + + private final class VocabParser implements TripleSink { + @Override + public void addNonLiteral(String subj, String pred, String obj) { + if (subj.startsWith(RDF.BNODE_PREFIX) || obj.startsWith(RDF.BNODE_PREFIX)) { + return; + } + if (pred.equals(OWL.EQUIVALENT_PROPERTY) || pred.equals(OWL.EQUIVALENT_CLASS)) { + addExpansion(subj, obj); + addExpansion(obj, subj); + terms.add(obj); + terms.add(subj); + } else if (pred.equals(RDFS.SUB_CLASS_OF) || pred.equals(RDFS.SUB_PROPERTY_OF)) { + addExpansion(subj, obj); + terms.add(obj); + terms.add(subj); + } + if (pred.equals(RDF.TYPE) && (obj.equals(RDF.PROPERTY) || obj.equals(RDFS.CLASS))) { + terms.add(subj); + } + } + + @Override + public void addPlainLiteral(String subj, String pred, String content, String lang) { + } + + @Override + public void addTypedLiteral(String subj, String pred, String content, String type) { + } + + @Override + public void 
setBaseUri(String baseUri) { + } + + @Override + public void startStream() throws ParseException { + } + + @Override + public void endStream() throws ParseException { + } + + @Override + public boolean setProperty(String key, Object value) { + return false; + } + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/ri/MalformedCurieException.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/ri/MalformedCurieException.java new file mode 100644 index 00000000..35baac81 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/ri/MalformedCurieException.java @@ -0,0 +1,27 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.sparqlanything.html.org.semarglproject.ri; + +import io.github.sparqlanything.html.org.semarglproject.ri.MalformedIriException; + +public final class MalformedCurieException extends MalformedIriException { + + private static final long serialVersionUID = -1077691754818847298L; + + public MalformedCurieException(String message) { + super(message); + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/ri/MalformedIriException.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/ri/MalformedIriException.java new file mode 100644 index 00000000..bf6f6f5e --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/ri/MalformedIriException.java @@ -0,0 +1,25 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Checked exception carrying a detail message. It is thrown by
 * {@code RIUtils.resolveIri} when a reference cannot be resolved to an IRI.
 */
public class MalformedIriException extends Exception {

    private static final long serialVersionUID = -8791044111458438579L;

    /**
     * @param message detail message, available through {@link #getMessage()}
     */
    public MalformedIriException(String message) {
        super(message);
    }
}
+ */ +public final class RIUtils { + + private static final Pattern ABS_OPAQUE_IRI_PATTERN = Pattern.compile( + // scheme + "[a-zA-Z][a-zA-Z0-9+.-]*:" + // opaque part + + "[^#/][^#]*", + Pattern.DOTALL); + + private static final Pattern ABS_HIER_IRI_PATTERN = Pattern.compile( + // scheme + "[a-zA-Z][a-zA-Z0-9+.-]*:" + // user + + "/{1,3}(([^/?#@]*)@)?" + // host + + "(\\[[^@/?#]+\\]|([^@/?#:]+))" + // port + + "(:([^/?#]*))?" + // path + + "([^#?]*)?" + // query + + "(\\?([^#]*))?" + // fragment + + "(#[^#]*)?", + Pattern.DOTALL); + + private static final Pattern URN_PATTERN = Pattern.compile("urn:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:.+"); + + private RIUtils() { + } + + /** + * Resolves specified IRI. Absolute IRI are returned unmodified + * @param base base to resolve against + * @param iri IRI to be resolved + * @return resolved absolute IRI + * @throws MalformedIriException + */ + public static String resolveIri(String base, String iri) throws MalformedIriException { + if (iri == null) { + return null; + } + if (isIri(iri) || isUrn(iri)) { + return iri; + } else { + if (iri.startsWith("?") || iri.isEmpty()) { + if (base.endsWith("#")) { + return base.substring(0, base.length() - 1) + iri; + } + return base + iri; + } + String result; + try { + URL basePart = new URL(base); + result = new URL(basePart, iri).toString(); + } catch (MalformedURLException e) { + result = base + iri; + } + if (isIri(result)) { + return result; + } + throw new MalformedIriException("Malformed IRI: " + iri); + } + } + + /** + * Checks if specified string is IRI + * @param value value to check + * @return true if value is IRI + */ + public static boolean isIri(String value) { + return ABS_HIER_IRI_PATTERN.matcher(value).matches() || ABS_OPAQUE_IRI_PATTERN.matcher(value).matches(); + } + + /** + * Checks if specified string is absolute IRI + * @param value value to check + * @return true if value is absolute IRI + */ + public static boolean isAbsoluteIri(String value) { + return 
ABS_HIER_IRI_PATTERN.matcher(value).matches(); + } + + /** + * Checks if specified string is URN + * @param value value to check + * @return true if value is URN + */ + public static boolean isUrn(String value) { + return URN_PATTERN.matcher(value).matches(); + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/CharOutputSink.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/CharOutputSink.java new file mode 100644 index 00000000..c766c5f4 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/CharOutputSink.java @@ -0,0 +1,199 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.sink; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.sink.CharSink; + +import java.io.Closeable; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.Charset; + +/** + * Implementation of {@link io.github.sparqlanything.html.org.semarglproject.sink.CharSink}. 
Provides bridging to Java IO APIs + * ({@link Writer}, {@link OutputStream}, {@link File}). + */ +public final class CharOutputSink implements CharSink { + + private File file; + private Writer writer; + private OutputStream outputStream; + private boolean closeOnEndStream; + private final Charset charset; + + private static final short BATCH_SIZE = 256; + private StringBuilder buffer; + private short bufferSize; + + /** + * Creates class instance with default charset encoding.. + */ + public CharOutputSink() { + this(Charset.defaultCharset()); + } + + /** + * Creates class instance with specified charset encoding. + * @param charset charset + */ + public CharOutputSink(Charset charset) { + this.charset = charset; + } + + /** + * Creates class instance with specified charset name. + * @param charsetName charset name + */ + public CharOutputSink(String charsetName) { + this.charset = Charset.forName(charsetName); + } + + /** + * Redirects output to specified file + * @param file output file + */ + public void connect(File file) { + this.file = file; + this.writer = null; + this.outputStream = null; + this.closeOnEndStream = true; + } + + /** + * Redirects output to specified writer + * @param writer output writer + */ + public void connect(Writer writer) { + this.file = null; + this.writer = writer; + this.outputStream = null; + this.closeOnEndStream = false; + } + + /** + * Redirects output to specified stream + * @param outputStream output stream + */ + public void connect(OutputStream outputStream) { + this.file = null; + this.writer = null; + this.outputStream = outputStream; + this.closeOnEndStream = false; + } + + @Override + public CharOutputSink process(String str) throws ParseException { + buffer.append(str); + bufferSize += str.length(); + writeBuffer(); + return this; + } + + @Override + public CharOutputSink process(char ch) throws ParseException { + buffer.append(ch); + bufferSize++; + writeBuffer(); + return this; + } + + @Override + public 
CharOutputSink process(char[] buffer, int start, int count) throws ParseException { + this.buffer.append(buffer, start, count); + bufferSize += count; + writeBuffer(); + return this; + } + + private void writeBuffer() { + if (bufferSize >= BATCH_SIZE) { + try { + try { + writer.write(buffer.toString()); + } catch (IOException e) { + throw new ParseException(e); + } + } catch (ParseException e) { + // do nothing + } + buffer = new StringBuilder(BATCH_SIZE); + bufferSize = 0; + } + } + + @Override + public void setBaseUri(String baseUri) { + } + + @Override + public void startStream() throws ParseException { + buffer = new StringBuilder(); + bufferSize = 0; + if (writer == null) { + if (file != null) { + try { + writer = new OutputStreamWriter(new FileOutputStream(file), charset); + } catch (FileNotFoundException e) { + throw new ParseException(e); + } + } else if (outputStream != null) { + writer = new OutputStreamWriter(outputStream, charset); + } + } + } + + @Override + public void endStream() throws ParseException { + buffer.append("\n"); + bufferSize = BATCH_SIZE; + writeBuffer(); + try { + writer.flush(); + } catch (IOException e) { + throw new ParseException(e); + } + if (closeOnEndStream) { + if (writer != null) { + closeQuietly(writer); + writer = null; + } else if (outputStream != null) { + closeQuietly(outputStream); + outputStream = null; + } + } + } + + @Override + public boolean setProperty(String key, Object value) { + return false; + } + + private static void closeQuietly(Closeable closeable) { + try { + if (closeable != null) { + closeable.close(); + } + } catch (IOException ioe) { + // ignore + } + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/CharSink.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/CharSink.java new file mode 100644 index 00000000..904ed5d8 --- /dev/null +++ 
b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/CharSink.java @@ -0,0 +1,51 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.sink; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.sink.DataSink; + +/** + * Interface for handling events from CharSource + */ +public interface CharSink extends DataSink { + + /** + * Callback for string processing + * + * @param str string for processing + * @throws ParseException + */ + CharSink process(String str) throws ParseException; + + /** + * Callback for char processing + * + * @param ch char for processing + * @throws ParseException + */ + CharSink process(char ch) throws ParseException; + + /** + * Callback for buffer processing + * + * @param buffer char buffer for processing + * @param start position to start + * @param count count of chars to process + * @throws ParseException + */ + CharSink process(char[] buffer, int start, int count) throws ParseException; +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/DataSink.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/DataSink.java new file mode 100644 index 00000000..475d5a53 --- /dev/null 
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/DataSink.java @@ -0,0 +1,50 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.sink; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; + +/** + * Base sink interface. + */ +public interface DataSink { + + /** + * Sets document base URI. Must be called befor start stream event. + * @param baseUri base URI + */ + void setBaseUri(String baseUri); + + /** + * Callback for start stream event. + * @throws ParseException + */ + void startStream() throws ParseException; + + /** + * Callback for end stream event. + * @throws ParseException + */ + void endStream() throws ParseException; + + /** + * Key-value based settings. Property settings are passed to child sinks. 
+ * @param key property key + * @param value property value + * @return true if at least one sink understands specified property, false otherwise + */ + boolean setProperty(String key, Object value); +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/Pipe.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/Pipe.java new file mode 100644 index 00000000..6ab7ae6c --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/Pipe.java @@ -0,0 +1,54 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.sink; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.sink.DataSink; + +/** + * Base class for pipeline procecessing blocks with one source and one sink. 
+ * @param class of output sink + */ +public abstract class Pipe implements DataSink { + + protected final S sink; + + protected Pipe(S sink) { + this.sink = sink; + } + + @Override + public void startStream() throws ParseException { + sink.startStream(); + } + + @Override + public void endStream() throws ParseException { + sink.endStream(); + } + + @Override + public final boolean setProperty(String key, Object value) { + boolean sinkResult = false; + if (sink != null) { + sinkResult = sink.setProperty(key, value); + } + return setPropertyInternal(key, value) || sinkResult; + } + + protected abstract boolean setPropertyInternal(String key, Object value); + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/QuadSink.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/QuadSink.java new file mode 100644 index 00000000..4cf04161 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/QuadSink.java @@ -0,0 +1,55 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.sparqlanything.html.org.semarglproject.sink; + +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; + +/** + * Interface for quad consuming + */ +public interface QuadSink extends TripleSink { + + /** + * Callback for handling triples with non literal object + * @param subj subject's IRI or BNode name + * @param pred predicate's IRI + * @param obj object's IRI or BNode name + * @param graph graph's IRI + */ + void addNonLiteral(String subj, String pred, String obj, String graph); + + /** + * Callback for handling triples with plain literal objects + * @param subj subject's IRI or BNode name + * @param pred predicate's IRI + * @param content unescaped string representation of content + * @param lang content's lang, can be null if no language specified + * @param graph graph's IRI + */ + void addPlainLiteral(String subj, String pred, String content, String lang, String graph); + + /** + * Callback for handling triples with typed literal objects + * @param subj subject's IRI or BNode name + * @param pred predicate's IRI + * @param content unescaped string representation of content + * @param type literal datatype's IRI + * @param graph graph's IRI + */ + void addTypedLiteral(String subj, String pred, String content, String type, String graph); + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/TripleSink.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/TripleSink.java new file mode 100644 index 00000000..73ca79e8 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/TripleSink.java @@ -0,0 +1,51 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.sink; + +import io.github.sparqlanything.html.org.semarglproject.sink.DataSink; + +/** + * Interface for triple consuming. + */ +public interface TripleSink extends DataSink { + + /** + * Callback for handling triples with non literal object + * @param subj subject's IRI or BNode name + * @param pred predicate's IRI + * @param obj object's IRI or BNode name + */ + void addNonLiteral(String subj, String pred, String obj); + + /** + * Callback for handling triples with plain literal objects + * @param subj subject's IRI or BNode name + * @param pred predicate's IRI + * @param content unescaped string representation of content + * @param lang content's lang, can be null if no language specified + */ + void addPlainLiteral(String subj, String pred, String content, String lang); + + /** + * Callback for handling triples with typed literal objects + * @param subj subject's IRI or BNode name + * @param pred predicate's IRI + * @param content unescaped string representation of content + * @param type literal datatype's IRI + */ + void addTypedLiteral(String subj, String pred, String content, String type); + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/XmlSink.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/XmlSink.java new file mode 100644 index 00000000..f27e1827 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/sink/XmlSink.java @@ 
-0,0 +1,34 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.sink;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.DataSink;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * Sink interface for streaming XML processors. Besides being a {@link DataSink},
+ * an implementation receives SAX content and lexical events directly.
+ */
+public interface XmlSink extends DataSink, ContentHandler, LexicalHandler {
+
+    /**
+     * Unwraps underlying ParseException from SAXException or
+     * wraps generic SAXException with ParseException.
+     * @param e exception raised while the XML input was being parsed
+     * @return the ParseException to report to the caller
+     */
+    ParseException processException(SAXException e);
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/AbstractSource.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/AbstractSource.java
new file mode 100644
index 00000000..3f304dd2
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/AbstractSource.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.source;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.DataSink;
+
+import java.io.InputStream;
+import java.io.Reader;
+
+/**
+ * Base class for sources that read a document and hand it to a sink.
+ *
+ * @param <S> concrete sink type the source feeds
+ */
+abstract class AbstractSource<S extends DataSink> {
+
+    /** Sink receiving the data; set once by the constructor. */
+    protected final S sink;
+
+    /**
+     * @param sink sink this source will stream to
+     */
+    protected AbstractSource(S sink) {
+        this.sink = sink;
+    }
+
+    /**
+     * Processes a document supplied as characters.
+     * @param reader document's reader
+     * @param mimeType document's MIME type
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    protected abstract void process(Reader reader, String mimeType, String baseUri) throws ParseException;
+
+    /**
+     * Processes a document supplied as bytes.
+     * @param inputStream document's input stream
+     * @param mimeType document's MIME type
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    protected abstract void process(InputStream inputStream, String mimeType, String baseUri) throws ParseException;
+
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/BaseStreamProcessor.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/BaseStreamProcessor.java
new file mode 100644
index 00000000..6864e399
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/BaseStreamProcessor.java
@@ -0,0 +1,200 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.source;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.CharSink;
+import io.github.sparqlanything.html.org.semarglproject.sink.DataSink;
+import io.github.sparqlanything.html.org.semarglproject.sink.XmlSink;
+import io.github.sparqlanything.html.org.semarglproject.source.AbstractSource;
+import io.github.sparqlanything.html.org.semarglproject.source.CharSource;
+import io.github.sparqlanything.html.org.semarglproject.source.XmlSource;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Pipeline managing class to subclass from. Every public {@code process} overload
+ * ends up in one of the two {@code final} three-argument variants, which wrap the
+ * subclass's {@code processInternal} call between {@link #startStream()} and
+ * {@link #endStream()}.
+ */
+public abstract class BaseStreamProcessor {
+
+    /**
+     * Invoked before the document is handed to {@code processInternal}.
+     * @throws ParseException if the stream can not be started
+     */
+    protected abstract void startStream() throws ParseException;
+
+    /**
+     * Invoked after {@code processInternal} has returned or thrown.
+     * @throws ParseException if the stream can not be finished
+     */
+    protected abstract void endStream() throws ParseException;
+
+    /**
+     * Performs the actual processing of character input.
+     * @param reader document's reader
+     * @param mimeType document's MIME type, can be null
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    protected abstract void processInternal(Reader reader, String mimeType, String baseUri) throws ParseException;
+
+    /**
+     * Performs the actual processing of byte input.
+     * @param inputStream document's input stream
+     * @param mimeType document's MIME type, can be null
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    protected abstract void processInternal(InputStream inputStream, String mimeType,
+                                            String baseUri) throws ParseException;
+
+    /**
+     * Key-value based settings. Property settings are passed to child sinks.
+     * @param key property key
+     * @param value property value
+     * @return true if at least one sink understands specified property, false otherwise
+     */
+    public abstract boolean setProperty(String key, Object value);
+
+    /**
+     * Processes specified document's file using file path as base URI.
+     * The base URI is {@code "file://"} followed by the file's absolute path.
+     * @param file document's file
+     * @throws ParseException if the file can not be opened or processed
+     */
+    public final void process(File file) throws ParseException {
+        String baseUri = "file://" + file.getAbsolutePath();
+        process(file, baseUri);
+    }
+
+    /**
+     * Processes specified document's file. The file content is decoded as UTF-8.
+     * @param file document's file
+     * @param baseUri document's base URI
+     * @throws ParseException if the file can not be opened or processed
+     */
+    public final void process(File file, String baseUri) throws ParseException {
+        Reader reader;
+        try {
+            // Decode explicitly as UTF-8 instead of relying on the platform default
+            // charset that FileReader would pick up.
+            reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
+        } catch (FileNotFoundException e) {
+            throw new ParseException(e);
+        }
+        try {
+            process(reader, null, baseUri);
+        } finally {
+            closeQuietly(reader);
+        }
+    }
+
+    /**
+     * Processes document pointed by specified URI, using that URI as base URI too.
+     * @param uri document's URI
+     * @throws ParseException if the document can not be fetched or processed
+     */
+    public final void process(String uri) throws ParseException {
+        process(uri, uri);
+    }
+
+    /**
+     * Processes document pointed by specified URI. Uses the content type reported
+     * by the connection as MIME type.
+     * @param uri document's URI, must be a valid URL
+     * @param baseUri document's base URI
+     * @throws ParseException if the URI is malformed or the document can not be fetched or processed
+     */
+    public final void process(String uri, String baseUri) throws ParseException {
+        URL url;
+        try {
+            url = new URL(uri);
+        } catch (MalformedURLException e) {
+            throw new ParseException(e);
+        }
+        try {
+            URLConnection urlConnection = url.openConnection();
+            String mimeType = urlConnection.getContentType();
+            InputStream inputStream = urlConnection.getInputStream();
+            try {
+                process(inputStream, mimeType, baseUri);
+            } finally {
+                closeQuietly(inputStream);
+            }
+        } catch (IOException e) {
+            throw new ParseException(e);
+        }
+    }
+
+    /**
+     * Processes stream input for document, without a MIME type.
+     * @param inputStream document's input stream
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    public void process(InputStream inputStream, String baseUri) throws ParseException {
+        process(inputStream, null, baseUri);
+    }
+
+    /**
+     * Processes stream input for document. {@link #endStream()} is called even
+     * when processing fails.
+     * @param inputStream document's input stream
+     * @param mimeType document's MIME type
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    public final void process(InputStream inputStream, String mimeType, String baseUri) throws ParseException {
+        startStream();
+        try {
+            processInternal(inputStream, mimeType, baseUri);
+        } finally {
+            endStream();
+        }
+    }
+
+    /**
+     * Processes reader input for document, without a MIME type.
+     * @param reader document's reader
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    public void process(Reader reader, String baseUri) throws ParseException {
+        process(reader, null, baseUri);
+    }
+
+    /**
+     * Processes reader input for document. {@link #endStream()} is called even
+     * when processing fails.
+     * @param reader document's reader
+     * @param mimeType document's MIME type
+     * @param baseUri document's base URI
+     * @throws ParseException if the document can not be processed
+     */
+    public final void process(Reader reader, String mimeType, String baseUri) throws ParseException {
+        startStream();
+        try {
+            processInternal(reader, mimeType, baseUri);
+        } finally {
+            endStream();
+        }
+    }
+
+    /**
+     * Creates source appropriate for specified sink.
+     * @param sink sink to create source for
+     * @return new instance of source which can stream to sink, or null when the sink
+     *         is neither a {@link CharSink} nor an {@link XmlSink}
+     */
+    protected static AbstractSource createSourceForSink(DataSink sink) {
+        if (sink instanceof CharSink) {
+            return new CharSource((CharSink) sink);
+        } else if (sink instanceof XmlSink) {
+            return new XmlSource((XmlSink) sink);
+        }
+        return null;
+    }
+
+    /**
+     * Closes the resource, tolerating null and suppressing any IOException.
+     * @param closeable resource to close, can be null
+     */
+    static void closeQuietly(Closeable closeable) {
+        try {
+            if (closeable != null) {
+                closeable.close();
+            }
+        } catch (IOException ignored) {
+            // best-effort cleanup: a failure to close must not mask the processing result
+        }
+    }
+
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/CharSource.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/CharSource.java
new file mode 100644
index 00000000..efc6392c
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/CharSource.java
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.source;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.CharSink;
+import io.github.sparqlanything.html.org.semarglproject.source.AbstractSource;
+import io.github.sparqlanything.html.org.semarglproject.source.BaseStreamProcessor;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+
+/**
+ * Source that pushes a document to a {@link CharSink} as raw character chunks.
+ */
+final class CharSource extends AbstractSource<CharSink> {
+
+    /**
+     * @param sink sink receiving the characters
+     */
+    CharSource(CharSink sink) {
+        super(sink);
+    }
+
+    /**
+     * Sets the base URI on the sink, then feeds it the reader's content in
+     * chunks of at most 512 chars. The reader is closed before returning.
+     * @param reader document's reader
+     * @param mimeType document's MIME type (not used here)
+     * @param baseUri document's base URI
+     * @throws ParseException if reading fails or the sink rejects the input
+     */
+    @Override
+    public void process(Reader reader, String mimeType, String baseUri) throws ParseException {
+        BufferedReader bufferedReader = new BufferedReader(reader);
+        try {
+            sink.setBaseUri(baseUri);
+            char[] buffer = new char[512];
+            int read;
+            while ((read = bufferedReader.read(buffer)) != -1) {
+                sink.process(buffer, 0, read);
+            }
+        } catch (IOException e) {
+            throw new ParseException(e);
+        } finally {
+            BaseStreamProcessor.closeQuietly(bufferedReader);
+        }
+    }
+
+    /**
+     * Decodes the stream as UTF-8 and delegates to the reader variant.
+     * The stream is closed before returning.
+     * @param inputStream document's input stream
+     * @param mimeType document's MIME type (not used here)
+     * @param baseUri document's base URI
+     * @throws ParseException if reading fails or the sink rejects the input
+     */
+    @Override
+    public void process(InputStream inputStream, String mimeType, String baseUri) throws ParseException {
+        Reader reader = new InputStreamReader(inputStream, Charset.forName("UTF-8"));
+        try {
+            process(reader, mimeType, baseUri);
+        } finally {
+            BaseStreamProcessor.closeQuietly(reader);
+        }
+    }
+
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/StreamProcessor.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/StreamProcessor.java
new file mode 100644
index 00000000..08bb428b
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/StreamProcessor.java
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.source;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.DataSink;
+import io.github.sparqlanything.html.org.semarglproject.source.AbstractSource;
+import io.github.sparqlanything.html.org.semarglproject.source.BaseStreamProcessor;
+import io.github.sparqlanything.html.org.semarglproject.source.XmlSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Objects;
+
+/**
+ * Simple pipeline managing wrapper. Automatically instantiates source appropriate for specified sink.
+ * Provides processing and setup methods.
+ * <br>
+ * List of supported properties:
+ * <ul>
+ *     <li>{@link #XML_READER_PROPERTY}</li>
+ * </ul>
+ */
+public final class StreamProcessor extends BaseStreamProcessor {
+
+    /**
+     * Used as a key with {@link #setProperty(String, Object)} method.
+     * Allows to specify custom {@link XMLReader} used with SAX parsers.
+     */
+    public static final String XML_READER_PROPERTY = "http://semarglproject.org/core/properties/xml-parser";
+
+    /**
+     * Used as a key with {@link #setProperty(String, Object)} method.
+     * Enables or disables error recovery mechanism.
+     */
+    public static final String ENABLE_ERROR_RECOVERY =
+            "http://semarglproject.org/core/properties/enable-error-recovery";
+
+    /**
+     * Used as a key with {@link #setProperty(String, Object)} method.
+     * Allows to specify handler for processor events.
+     * Subclass of {@link io.github.sparqlanything.html.org.semarglproject.rdf.ProcessorGraphHandler} must be passed as a value.
+     */
+    public static final String PROCESSOR_GRAPH_HANDLER_PROPERTY =
+            "http://semarglproject.org/core/properties/processor-graph-handler";
+
+    private final DataSink sink;
+    // null when createSourceForSink does not know the sink type
+    private final AbstractSource source;
+
+    /**
+     * Instantiates stream processor for pipe starting with specified sink.
+     * @param sink pipe's input
+     */
+    public StreamProcessor(DataSink sink) {
+        this.sink = sink;
+        this.source = createSourceForSink(sink);
+    }
+
+    /**
+     * Streams the input to the sink through the source chosen at construction.
+     * @throws NullPointerException if no source exists for the sink's type
+     */
+    @Override
+    public void processInternal(InputStream inputStream, String mimeType, String baseUri) throws ParseException {
+        requireSource().process(inputStream, mimeType, baseUri);
+    }
+
+    @Override
+    protected void startStream() throws ParseException {
+        sink.startStream();
+    }
+
+    @Override
+    protected void endStream() throws ParseException {
+        sink.endStream();
+    }
+
+    /**
+     * Streams the input to the sink through the source chosen at construction.
+     * @throws NullPointerException if no source exists for the sink's type
+     */
+    @Override
+    public void processInternal(Reader reader, String mimeType, String baseUri) throws ParseException {
+        requireSource().process(reader, mimeType, baseUri);
+    }
+
+    /**
+     * Installs a custom {@link XMLReader} when the key is {@link #XML_READER_PROPERTY}
+     * and the source is XML based; the property is forwarded to the sink in every case.
+     * @param key property key
+     * @param value property value
+     * @return true if the reader was installed or the sink understood the property
+     * @throws IllegalArgumentException if the reader can not be initialized
+     */
+    @Override
+    public boolean setProperty(String key, Object value) {
+        boolean result = false;
+        // instanceof is false for null, so no separate null check is needed
+        if (XML_READER_PROPERTY.equals(key) && value instanceof XMLReader && source instanceof XmlSource) {
+            try {
+                ((XmlSource) source).setXmlReader((XMLReader) value);
+                result = true;
+            } catch (SAXException e) {
+                throw new IllegalArgumentException("XMLReader was not able to be initialized", e);
+            }
+        }
+        return sink.setProperty(key, value) || result;
+    }
+
+    /** Returns the source, failing with an explanatory message when none was created. */
+    private AbstractSource requireSource() {
+        return Objects.requireNonNull(source,
+                "No source available: sink is neither a CharSink nor an XmlSink");
+    }
+
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/XmlSource.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/XmlSource.java
new file mode 100644
index 00000000..14cac2b4
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/source/XmlSource.java
@@ -0,0 +1,95 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.source;
+
+import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException;
+import io.github.sparqlanything.html.org.semarglproject.sink.XmlSink;
+import io.github.sparqlanything.html.org.semarglproject.source.AbstractSource;
+import io.github.sparqlanything.html.org.semarglproject.source.BaseStreamProcessor;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+
+/**
+ * Source that parses a document with a SAX {@link XMLReader} and delivers the
+ * events to an {@link XmlSink}.
+ */
+final class XmlSource extends AbstractSource<XmlSink> {
+
+    // created lazily by initXmlReader() unless a reader was supplied via setXmlReader()
+    private XMLReader xmlReader = null;
+
+    /**
+     * @param sink sink receiving the SAX events
+     */
+    XmlSource(XmlSink sink) {
+        super(sink);
+    }
+
+    /**
+     * Parses the reader's content with the configured XMLReader.
+     * @param reader document's reader
+     * @param mimeType document's MIME type (not used here)
+     * @param baseUri document's base URI
+     * @throws ParseException if the XMLReader can not be set up, or parsing or reading fails
+     */
+    @Override
+    public void process(Reader reader, String mimeType, String baseUri) throws ParseException {
+        try {
+            initXmlReader();
+        } catch (SAXException e) {
+            throw new ParseException("Can not instantinate XMLReader", e);
+        }
+        try {
+            sink.setBaseUri(baseUri);
+            xmlReader.parse(new InputSource(reader));
+        } catch (SAXException e) {
+            // let the sink translate the failure first, then give it a chance to finish the document
+            ParseException wrappedException = sink.processException(e);
+            try {
+                sink.endDocument();
+            } catch (SAXException e2) {
+                // do nothing
+            }
+            throw wrappedException;
+        } catch (IOException e) {
+            throw new ParseException(e);
+        }
+    }
+
+    /**
+     * Decodes the stream as UTF-8 and delegates to the reader variant.
+     * The stream is closed before returning.
+     * @param inputStream document's input stream
+     * @param mimeType document's MIME type (not used here)
+     * @param baseUri document's base URI
+     * @throws ParseException if the XMLReader can not be set up, or parsing or reading fails
+     */
+    @Override
+    public void process(InputStream inputStream, String mimeType, String baseUri) throws ParseException {
+        Reader reader = new InputStreamReader(inputStream, Charset.forName("UTF-8"));
+        try {
+            process(reader, mimeType, baseUri);
+        } finally {
+            BaseStreamProcessor.closeQuietly(reader);
+        }
+    }
+
+    /** Ensures a reader exists and registers the sink as its content and lexical handler. */
+    private void initXmlReader() throws SAXException {
+        if (xmlReader == null) {
+            xmlReader = getDefaultXmlReader();
+        }
+        xmlReader.setContentHandler(sink);
+        xmlReader.setProperty("http://xml.org/sax/properties/lexical-handler", sink);
+    }
+
+    /**
+     * Sets the reader used for parsing.
+     * @param xmlReader reader to use; null selects the default reader
+     * @throws SAXException if the default reader can not be created
+     */
+    public void setXmlReader(XMLReader xmlReader) throws SAXException {
+        if (xmlReader == null) {
+            this.xmlReader = getDefaultXmlReader();
+        } else {
+            this.xmlReader = xmlReader;
+        }
+    }
+
+    /**
+     * Creates the default reader. External DTDs and external entities are not
+     * loaded; DOCTYPE declarations themselves remain allowed.
+     * @return new XMLReader instance
+     * @throws SAXException if the reader can not be created or configured
+     */
+    public static XMLReader getDefaultXmlReader() throws SAXException {
+        XMLReader result = XMLReaderFactory.createXMLReader();
+        result.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+        // Input may be untrusted: never resolve external entities (XXE).
+        // Internal entities declared in the document itself keep working.
+        result.setFeature("http://xml.org/sax/features/external-general-entities", false);
+        result.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
+        return result;
+    }
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/OWL.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/OWL.java
new file mode 100644
index 00000000..fcdef073
--- /dev/null
+++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/OWL.java
@@ -0,0 +1,111 @@
+/**
+ * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.sparqlanything.html.org.semarglproject.vocab;
+
+/**
+ * Defines URIs for the OWL vocabulary terms.
+ */
+public final class OWL {
+    // Namespace IRI; every term below is this prefix plus the term's local name.
+    public static final String NS = "http://www.w3.org/2002/07/owl#";
+
+    // OWL 2 RDF-Based Vocabulary
+
+    public static final String ALL_DIFFERENT = NS + "AllDifferent";
+    public static final String ALL_DISJOINT_CLASSES = NS + "AllDisjointClasses";
+    public static final String ALL_DISJOINT_PROPERTIES = NS + "AllDisjointProperties";
+    public static final String ALL_VALUES_FROM = NS + "allValuesFrom";
+    public static final String ANNOTATED_PROPERTY = NS + "annotatedProperty";
+    public static final String ANNOTATED_SOURCE = NS + "annotatedSource";
+    public static final String ANNOTATED_TARGET = NS + "annotatedTarget";
+    public static final String ANNOTATION = NS + "Annotation";
+    public static final String ANNOTATION_PROPERTY = NS + "AnnotationProperty";
+    public static final String ASSERTION_PROPERTY = NS + "assertionProperty";
+    public static final String ASYMMETRIC_PROPERTY = NS + "AsymmetricProperty";
+    public static final String AXIOM = NS + "Axiom";
+    public static final String BACKWARD_COMPATIBLE_WITH = NS + "backwardCompatibleWith";
+    public static final String BOTTOM_DATA_PROPERTY = NS + "bottomDataProperty";
+    public static final String BOTTOM_OBJECT_PROPERTY = NS + "bottomObjectProperty";
+    public static final String CARDINALITY = NS + "cardinality";
+    public static final String CLASS = NS + "Class";
+    public static final String COMPLEMENT_OF = NS + "complementOf";
+    public static final String DATA_RANGE = NS + "DataRange";
+    public static final String DATATYPE_COMPLEMENT_OF = NS + "datatypeComplementOf";
+    public static final String DATATYPE_PROPERTY = NS + "DatatypeProperty";
+    public static final String DEPRECATED = NS + "deprecated";
+    public static final String DEPRECATED_CLASS = NS + "DeprecatedClass";
+    public static final String DEPRECATED_PROPERTY = NS + "DeprecatedProperty";
+    public static final String DIFFERENT_FROM = NS + "differentFrom";
+    public static final String DISJOINT_UNION_OF = NS + "disjointUnionOf";
+    public static final String DISJOINT_WITH = NS + "disjointWith";
+    public static final String DISTINCT_MEMBERS = NS + "distinctMembers";
+    public static final String EQUIVALENT_CLASS = NS + "equivalentClass";
+    public static final String EQUIVALENT_PROPERTY = NS + "equivalentProperty";
+    public static final String FUNCTIONAL_PROPERTY = NS + "FunctionalProperty";
+    public static final String HAS_KEY = NS + "hasKey";
+    public static final String HAS_SELF = NS + "hasSelf";
+    public static final String HAS_VALUE = NS + "hasValue";
+    public static final String IMPORTS = NS + "imports";
+    public static final String INCOMPATIBLE_WITH = NS + "incompatibleWith";
+    public static final String INTERSECTION_OF = NS + "intersectionOf";
+    public static final String INVERSE_FUNCTIONAL_PROPERTY = NS + "InverseFunctionalProperty";
+    public static final String INVERSE_OF = NS + "inverseOf";
+    public static final String IRREFLEXIVE_PROPERTY = NS + "IrreflexiveProperty";
+    public static final String MAX_CARDINALITY = NS + "maxCardinality";
+    public static final String MAX_QUALIFIED_CARDINALITY = NS + "maxQualifiedCardinality";
+    public static final String MEMBERS = NS + "members";
+    public static final String MIN_CARDINALITY = NS + "minCardinality";
+    public static final String MIN_QUALIFIED_CARDINALITY = NS + "minQualifiedCardinality";
+    public static final String NAMED_INDIVIDUAL = NS + "NamedIndividual";
+    public static final String NEGATIVE_PROPERTY_ASSERTION = NS + "NegativePropertyAssertion";
+    public static final String NOTHING = NS + "Nothing";
+    public static final String OBJECT_PROPERTY = NS + "ObjectProperty";
+    public static final String ON_CLASS = NS + "onClass";
+    public static final String ON_DATA_RANGE = NS + "onDataRange";
+    public static final String ON_DATATYPE = NS + "onDatatype";
+    public static final String ONE_OF = NS + "oneOf";
+    public static final String ON_PROPERTY = NS + "onProperty";
+    public static final String ON_PROPERTIES = NS + "onProperties";
+    public static final String ONTOLOGY = NS + "Ontology";
+    public static final String ONTOLOGY_PROPERTY = NS + "OntologyProperty";
+    public static final String PRIOR_VERSION = NS + "priorVersion";
+    public static final String PROPERTY_CHAIN_AXIOM = NS + "propertyChainAxiom";
+    public static final String PROPERTY_DISJOINT_WITH = NS + "propertyDisjointWith";
+    public static final String QUALIFIED_CARDINALITY = NS + "qualifiedCardinality";
+    public static final String REFLEXIVE_PROPERTY = NS + "ReflexiveProperty";
+    public static final String RESTRICTION = NS + "Restriction";
+    public static final String SAME_AS = NS + "sameAs";
+    public static final String SOME_VALUES_FROM = NS + "someValuesFrom";
+    public static final String SOURCE_INDIVIDUAL = NS + "sourceIndividual";
+    public static final String SYMMETRIC_PROPERTY = NS + "SymmetricProperty";
+    public static final String TARGET_INDIVIDUAL = NS + "targetIndividual";
+    public static final String TARGET_VALUE = NS + "targetValue";
+    public static final String THING = NS + "Thing";
+    public static final String TOP_DATA_PROPERTY = NS + "topDataProperty";
+    public static final String TOP_OBJECT_PROPERTY = NS + "topObjectProperty";
+    public static final String TRANSITIVE_PROPERTY = NS + "TransitiveProperty";
+    public static final String UNION_OF = NS + "unionOf";
+    public static final String VERSION_INFO = NS + "versionInfo";
+    public static final String VERSION_IRI = NS + "versionIRI";
+    public static final String WITH_RESTRICTIONS = NS + "withRestrictions";
+
+    // Datatypes of the OWL 2 RDF-Based Semantics
+
+    public static final String RATIONAL = NS + "rational";
+    public static final String REAL = NS + "real";
+
+    // Constants holder: the private constructor prevents instantiation.
+    private OWL() {
+    }
+}
diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDF.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDF.java
new file mode 100644
index 00000000..27aaee0a
--- /dev/null
+++
b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDF.java @@ -0,0 +1,82 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.vocab; + +/** + * Defines URIs for the RDF vocabulary terms and the blank-node (bnode) constants used by the framework. Constant holder only: the constructor is private. + */ +public final class RDF { + + public static final String BNODE_PREFIX = "_:"; + + // indicates that short bnode syntax shouldn't be used for this node + public static final String SHORTENABLE_BNODE_SUFFIX = "sbl"; + + public static final String NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + // Basic classes and properties + + public static final String PROPERTY = NS + "Property"; + public static final String XML_LITERAL = NS + "XMLLiteral"; + + public static final String TYPE = NS + "type"; + public static final String VALUE = NS + "value"; + + // Container and collection classes and properties + + public static final String ALT = NS + "Alt"; + public static final String BAG = NS + "Bag"; + public static final String SEQ = NS + "Seq"; + public static final String LIST = NS + "List"; + + public static final String FIRST = NS + "first"; + public static final String NIL = NS + "nil"; + public static final String REST = NS + "rest"; + + // Reification + + public static final String STATEMENT = NS + "Statement"; + + public static final String 
OBJECT = NS + "object"; + public static final String PREDICATE = NS + "predicate"; + public static final String SUBJECT = NS + "subject"; + + // Syntax names + + public static final String DESCRIPTION = NS + "Description"; + public static final String ID = NS + "ID"; + public static final String RDF = NS + "RDF"; + + public static final String ABOUT = NS + "about"; + public static final String DATATYPE = NS + "datatype"; + public static final String LI = NS + "li"; + public static final String NODEID = NS + "nodeID"; + public static final String PARSE_TYPE = NS + "parseType"; + public static final String RESOURCE = NS + "resource"; + + // Deprecated + + @Deprecated + public static final String ABOUT_EACH = NS + "aboutEach"; + @Deprecated + public static final String ABOUT_EACH_PREFIX = NS + "aboutEachPrefix"; + @Deprecated + public static final String BAG_ID = NS + "bagID"; + + private RDF() { + } + +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDFS.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDFS.java new file mode 100644 index 00000000..66784fc4 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDFS.java @@ -0,0 +1,43 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.sparqlanything.html.org.semarglproject.vocab; + +/** + * Defines URIs for the RDFS vocabulary terms. Every term constant is the {@link #NS} namespace followed by the term's local name; the class only holds constants and has a private constructor. + */ +public final class RDFS { + + public static final String NS = "http://www.w3.org/2000/01/rdf-schema#"; + + public static final String DOMAIN = NS + "domain"; + public static final String RANGE = NS + "range"; + public static final String RESOURCE = NS + "Resource"; + public static final String LITERAL = NS + "Literal"; + public static final String DATATYPE = NS + "Datatype"; + public static final String CLASS = NS + "Class"; + public static final String SUB_CLASS_OF = NS + "subClassOf"; + public static final String SUB_PROPERTY_OF = NS + "subPropertyOf"; + public static final String MEMBER = NS + "member"; + public static final String CONTAINER = NS + "Container"; + public static final String CONTAINER_MEMBERSHIP_PROPERTY = NS + "ContainerMembershipProperty"; + public static final String COMMENT = NS + "comment"; + public static final String SEE_ALSO = NS + "seeAlso"; + public static final String IS_DEFINED_BY = NS + "isDefinedBy"; + public static final String LABEL = NS + "label"; + + private RDFS() { + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDFa.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDFa.java new file mode 100644 index 00000000..f3e384f7 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/RDFa.java @@ -0,0 +1,58 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.vocab; + +/** + * Defines URIs for the RDFa vocabulary terms, the names of the RDFa attributes and numeric version flags. The {@code *_ATTR} constants are bare attribute names; the other String constants are {@link #NS}-prefixed URIs. + */ +public final class RDFa { + + public static final String NS = "http://www.w3.org/ns/rdfa#"; + + public static final short VERSION_10 = 1; + public static final short VERSION_11 = 2; + + public static final String ABOUT_ATTR = "about"; + public static final String CONTENT_ATTR = "content"; + public static final String DATATYPE_ATTR = "datatype"; + public static final String HREF_ATTR = "href"; + public static final String ID_ATTR = "id"; + public static final String INLIST_ATTR = "inlist"; + public static final String PREFIX_ATTR = "prefix"; + public static final String PROFILE_ATTR = "profile"; + public static final String PROPERTY_ATTR = "property"; + public static final String REL_ATTR = "rel"; + public static final String RESOURCE_ATTR = "resource"; + public static final String REV_ATTR = "rev"; + public static final String ROLE_ATTR = "role"; + public static final String SRC_ATTR = "src"; + public static final String TYPEOF_ATTR = "typeof"; + public static final String VOCAB_ATTR = "vocab"; + + public static final String CONTEXT = NS + "context"; + public static final String WARNING = NS + "Warning"; + public static final String PREFIX_REDEFINITION = NS + "PrefixRedefinition"; + public static final String UNRESOLVED_CURIE = NS + "UnresolvedCURIE"; + public static final String UNRESOLVED_TERM = NS + "UnresolvedTerm"; + public static final String ERROR = NS + "Error"; + public static 
final String USES_VOCABULARY = NS + "usesVocabulary"; + + public static final String COPY = NS + "copy"; + public static final String PATTERN = NS + "Pattern"; + + private RDFa() { + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/XSD.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/XSD.java new file mode 100644 index 00000000..12eb9e42 --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/vocab/XSD.java @@ -0,0 +1,72 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.sparqlanything.html.org.semarglproject.vocab; + +/** + * Defines URIs for the XSD built-in datatypes (primitive and derived). 
+ */ +public final class XSD { + /* XML Schema namespace; every constant below is this prefix followed by a datatype local name. */ + public static final String NS = "http://www.w3.org/2001/XMLSchema#"; + + public static final String ANY_URI = NS + "anyURI"; + public static final String BASE64_BINARY = NS + "base64Binary"; + public static final String DATE = NS + "date"; + public static final String DATE_TIME = NS + "dateTime"; + public static final String DECIMAL = NS + "decimal"; + public static final String DURATION = NS + "duration"; + public static final String ENTITIES = NS + "ENTITIES"; + public static final String ENTITY = NS + "ENTITY"; + public static final String G_DAY = NS + "gDay"; + public static final String G_MONTH = NS + "gMonth"; + public static final String G_MONTH_DAY = NS + "gMonthDay"; + public static final String G_YEAR = NS + "gYear"; + public static final String G_YEAR_MONTH = NS + "gYearMonth"; + public static final String HEX_BINARY = NS + "hexBinary"; + public static final String ID = NS + "ID"; + public static final String IDREF = NS + "IDREF"; + public static final String IDREFS = NS + "IDREFS"; + public static final String INTEGER = NS + "integer"; + public static final String LANGUAGE = NS + "language"; + public static final String NAME = NS + "Name"; + public static final String NC_NAME = NS + "NCName"; + public static final String NEGATIVE_INTEGER = NS + "negativeInteger"; + public static final String NMTOKEN = NS + "NMTOKEN"; + public static final String NMTOKENS = NS + "NMTOKENS"; + public static final String NON_NEGATIVE_INTEGER = NS + "nonNegativeInteger"; + public static final String NON_POSITIVE_INTEGER = NS + "nonPositiveInteger"; + public static final String NORMALIZED_STRING = NS + "normalizedString"; + public static final String NOTATION = NS + "NOTATION"; + public static final String POSITIVE_INTEGER = NS + "positiveInteger"; + public static final String QNAME = NS + "QName"; + public static final String TIME = NS + "time"; + public static final String TOKEN = NS + "token"; + public static final String UNSIGNED_BYTE = 
NS + "unsignedByte"; + public static final String UNSIGNED_INT = NS + "unsignedInt"; + public static final String UNSIGNED_LONG = NS + "unsignedLong"; + public static final String UNSIGNED_SHORT = NS + "unsignedShort"; + public static final String BOOLEAN = NS + "boolean"; + public static final String BYTE = NS + "byte"; + public static final String DOUBLE = NS + "double"; + public static final String FLOAT = NS + "float"; + public static final String INT = NS + "int"; + public static final String LONG = NS + "long"; + public static final String SHORT = NS + "short"; + public static final String STRING = NS + "string"; + + private XSD() { /* private: constant holder, not meant to be instantiated */ + } +} diff --git a/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/xml/XmlUtils.java b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/xml/XmlUtils.java new file mode 100644 index 00000000..65e96a6b --- /dev/null +++ b/sparql-anything-html/src/main/java/io/github/sparqlanything/html/org/semarglproject/xml/XmlUtils.java @@ -0,0 +1,186 @@ +/** + * Copyright 2012-2013 the Semargl contributors. See AUTHORS for more details. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.sparqlanything.html.org.semarglproject.xml; + +import org.xml.sax.Attributes; + +import java.util.BitSet; +import java.util.Map; +import java.util.regex.Pattern; + +/** + * Lightweight XML utils for internal usage: character-class lookup tables (filled once by the static initializer), NCName validation and open-tag serialization. NOTE(review): the public BitSet tables are mutable objects; presumably callers only read them - confirm. + */ +public final class XmlUtils { + + /** + * XML identifier start char checker + */ + public static final BitSet ID_START = new BitSet(); + + /** + * XML identifier char checker + */ + public static final BitSet ID = new BitSet(); + + /** + * XML whitespace char checker + */ + public static final BitSet WHITESPACE = new BitSet(); + + /** + * XML quote char checker + */ + public static final BitSet QUOTE = new BitSet(); + + /** + * XML greater char checker + */ + public static final BitSet GT = new BitSet(); + + /** + * XML right square bracket char checker + */ + public static final BitSet RIGHT_SQ_BRACKET = new BitSet(); + + /** + * XML lang attribute name + */ + public static final String XML_LANG = "xml:lang"; + + /** + * XML base attribute name + */ + public static final String XML_BASE = "xml:base"; + + /** + * Lang attribute name + */ + public static final String LANG = "lang"; + + private static final String NC_NAME_START_CHAR = "A-Za-z_\u00C0-\u00D6\u00D8-\u00F6" + + "\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F" + + "\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"; + // \u10000-\uEFFFF + private static final String NC_NAME_CHAR = "-.0-9\u00B7\u0300-\u036F\u203F-\u2040"; + private static final Pattern XML_NAME_PATTERN = Pattern.compile("[" + NC_NAME_START_CHAR + "]" + + "[" + NC_NAME_START_CHAR + NC_NAME_CHAR + "]*"); + + private static final String ID_START_STR = "ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; + private static final String ID_OTHER_STR = "-0123456789:"; + private static final String WHITESPACE_STR = " \t\r\n\f\u000B\u001C\u001D\u001E\u00A0\u2007\u202F"; + + private static final BitSet ID_START_OR_GT = new BitSet(); + private static final BitSet ID_START_OR_EQUAL = new 
BitSet(); + private static final BitSet ID_OR_QUOTE_OR_APOS = new BitSet(); + private static final BitSet APOS = new BitSet(); + private static final BitSet LT = new BitSet(); + + static { + LT.set('<'); + GT.set('>'); + APOS.set('\''); + QUOTE.set('\"'); + RIGHT_SQ_BRACKET.set(']'); + for (int i = 0; i < ID_START_STR.length(); i++) { + char c = ID_START_STR.charAt(i); + ID_START.set(c); + ID_START_OR_EQUAL.set(c); + ID_START_OR_GT.set(c); + ID.set(c); + ID_OR_QUOTE_OR_APOS.set(c); + } + for (int i = 0; i < ID_OTHER_STR.length(); i++) { + char c = ID_OTHER_STR.charAt(i); + ID.set(c); + ID_OR_QUOTE_OR_APOS.set(c); + } + ID_START_OR_GT.set('>'); + ID_START_OR_EQUAL.set('='); + ID_OR_QUOTE_OR_APOS.set('\''); + ID_OR_QUOTE_OR_APOS.set('\"'); + for (int i = 0; i < WHITESPACE_STR.length(); i++) { + char c = WHITESPACE_STR.charAt(i); + WHITESPACE.set(c); + } + } + + private XmlUtils() { + } + + + /** + * Checks if specified value is valid XML name (the whole string must match the NCName pattern; the supplementary range is left out of that pattern, see the commented-out range next to it) + * @param value value to check; must not be null, a null is passed straight to Pattern.matcher and fails there + * @return true if value is valid XML name + */ + public static boolean isValidNCName(String value) { + return XML_NAME_PATTERN.matcher(value).matches(); + } + + /** + * Serializes node open tag. Attributes are written first, then one xmlns declaration per entry of nsMappings; attribute and namespace values are concatenated as-is, no escaping happens in this method. + * @param nsUri node's NS URI; when non-empty it is stored in nsMappings under the prefix of qname (empty string if qname has no colon) + * @param qname node's QName + * @param nsMappings node's namespace mappings; modified by this call when nsUri is non-empty + * @param attrs node's attributes + * @param optimizeNs should unused namespaces be skipped + * @return string representation of open tag + */ + public static String serializeOpenTag(String nsUri, String qname, Map nsMappings, + Attributes attrs, boolean optimizeNs) { + String result = "<" + qname; + if (nsUri != null && nsUri.length() > 0) { + int idx = Math.max(qname.indexOf(':'), 0); + nsMappings.put(qname.substring(0, idx), nsUri); + } + for (int i = 0; i < attrs.getLength(); i++) { + result += " " + attrs.getQName(i) + "=\"" + attrs.getValue(i) + "\""; + } + for (String key : nsMappings.keySet()) { + if (optimizeNs && isPrefixIgnorable(key, qname, attrs)) { + continue; + } + + if 
(key.isEmpty()) { + String value = nsMappings.get(key); + result += " xmlns=\"" + value + "\""; + } else { + result += " xmlns:" + key + "=\"" + nsMappings.get(key) + "\""; + } + } + result += ">"; + return result; + } + /* Tells whether the xmlns declaration for prefix key can be dropped: true only if neither the element QName nor any attribute QName uses that prefix. The empty key is the default namespace and counts as used by any name without a colon; attributes whose QName starts with xml are skipped. */ + private static boolean isPrefixIgnorable(String key, String qname, Attributes attrs) { + boolean usagesFound = key.isEmpty() && qname.indexOf(':') == -1 || key.length() > 0 + && qname.startsWith(key + ":"); + for (int i = 0; i < attrs.getLength(); i++) { + String aqn = attrs.getQName(i); + if (aqn.startsWith("xml")) { + continue; + } + if (key.isEmpty() && aqn.indexOf(':') == -1 || key.length() > 0 + && aqn.startsWith(key + ":")) { + usagesFound = true; + break; + } + } + return !usagesFound; + } + +} diff --git a/sparql-anything-html/src/test/java/io/github/sparqlanything/html/RDFaSandbox.java b/sparql-anything-html/src/test/java/io/github/sparqlanything/html/RDFaSandbox.java new file mode 100644 index 00000000..9cfa8a2c --- /dev/null +++ b/sparql-anything-html/src/test/java/io/github/sparqlanything/html/RDFaSandbox.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024 SPARQL Anything Contributors @ http://github.com/sparql-anything + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.sparqlanything.html; + +import io.github.sparqlanything.html.org.semarglproject.rdf.ParseException; +import io.github.sparqlanything.html.org.semarglproject.rdf.rdfa.RdfaParser; +import io.github.sparqlanything.html.org.semarglproject.sink.TripleSink; +import io.github.sparqlanything.html.org.semarglproject.source.StreamProcessor; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; + +public class RDFaSandbox { + private static final Logger logger = LoggerFactory.getLogger(TestRDFaParser.class); + @Test + public void RDFa() throws ParseException { + + TripleSink ts = new TripleSink() { + @Override + public void addNonLiteral(String subj, String pred, String obj) { + logger.trace("add non literal {} {} {}", subj, pred, obj); + } + + @Override + public void addPlainLiteral(String subj, String pred, String content, String lang) { + logger.trace("addPlainLiteral {} {} {} {}", subj, pred, content, lang); + } + + @Override + public void addTypedLiteral(String subj, String pred, String content, String type) { + logger.trace("addTypedLiteral {} {} {} {}", subj, pred, content, type); + } + + @Override + public void setBaseUri(String baseUri) { + logger.trace("setBaseURI {}", baseUri); + + } + + @Override + public void startStream() throws ParseException { + logger.trace("start stream"); + } + + @Override + public void endStream() throws ParseException { + logger.trace("end stream"); + } + + @Override + public boolean setProperty(String key, Object value) { + logger.trace("set property {} {}", key,value); + return false; + } + }; + + logger.trace("test logger"); + + StreamProcessor streamProcessor = new StreamProcessor(RdfaParser.connect(ts)); + streamProcessor.process(new File("/Users/lgu/workspace/SPARQLAnything/sparql.anything/sparql-anything-html/src/test/resources/RDFa.html")); + } +} diff --git 
a/sparql-anything-html/src/test/java/io/github/sparqlanything/html/TestRDFaParser.java b/sparql-anything-html/src/test/java/io/github/sparqlanything/html/TestRDFaParser.java new file mode 100644 index 00000000..3ff7a270 --- /dev/null +++ b/sparql-anything-html/src/test/java/io/github/sparqlanything/html/TestRDFaParser.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 SPARQL Anything Contributors @ http://github.com/sparql-anything + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.sparqlanything.html; + +import io.github.sparqlanything.testutils.AbstractTriplifierTester; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.Properties; + +/** Runs HTMLTriplifier through AbstractTriplifierTester with the metadata property set to "true" and prints the whole graph. NOTE(review): the "html" and "nq" constructor arguments presumably select RDFa.html as input and RDFa.nq as expected output - confirm in AbstractTriplifierTester. */ +public class TestRDFaParser extends AbstractTriplifierTester { + + public TestRDFaParser() { + super(new HTMLTriplifier(), new Properties(), "html", "nq"); + this.printWholeGraph = true; + } + + @Ignore /* NOTE(review): no reason is given for skipping this test - record one or re-enable it */ + @Test + public void testRDFa() { + this.assertResultIsIsomorphicWithExpected(); + } + + protected void properties(Properties properties) { + properties.setProperty(HTMLTriplifier.PROPERTY_METADATA.toString(), "true"); + } + + + + +} diff --git a/sparql-anything-html/src/test/resources/RDFa.html b/sparql-anything-html/src/test/resources/RDFa.html new file mode 100644 index 00000000..692d2df7 --- /dev/null +++ b/sparql-anything-html/src/test/resources/RDFa.html @@ -0,0 +1,32 @@ + + + + + + + +
+

+ Alice Birpemswick, + Email: alice@example.com, + Phone: +1 617.555.7332 +

+
+ + + + \ No newline at end of file diff --git a/sparql-anything-html/src/test/resources/RDFa.nq b/sparql-anything-html/src/test/resources/RDFa.nq new file mode 100644 index 00000000..07288c54 --- /dev/null +++ b/sparql-anything-html/src/test/resources/RDFa.nq @@ -0,0 +1,60 @@ + "Director: James Cameron (born August 16, 1954)" . + "Director: James Cameron (born August 16, 1954)" . + . + "Director: James Cameron (born August 16, 1954)" . + "

Avatar

Director: James Cameron (born August 16, 1954)" . + "Avatar Director: James Cameron (born August 16, 1954)" . + . + "" . + "https://schema.org/Movie" . + . + . + . + "\n\n
\n

Avatar

Director: James Cameron (born August 16, 1954)\n
\n" . + "Avatar Director: James Cameron (born August 16, 1954)" . + . + . + . +_:Bnode1hn5vtssbx2 . +_:Bnode1hn5vtssbx2 "Avatar" . + "Avatar" . + "Avatar" . + . + "name" . + "Avatar" . + . + _:Bnode1hn5vtssbx2 . + "
\n

Avatar

Director: James Cameron (born August 16, 1954)\n
" . + "Avatar Director: James Cameron (born August 16, 1954)" . + . + . + "Director: James Cameron (born August 16, 1954)" . + . + "Director: James Cameron (born August 16, 1954)" . + "Director: James Cameron (born August 16, 1954)" . + . + . + "https://schema.org/Movie" . + "" . + . + "Avatar Director: James Cameron (born August 16, 1954)" . + "

Avatar

Director: James Cameron (born August 16, 1954)" . + . + . + . + "Avatar Director: James Cameron (born August 16, 1954)" . + "\n\n
\n

Avatar

Director: James Cameron (born August 16, 1954)\n
\n" . + . +_:Bnode1hn5vtssbx2 "Avatar" . +_:Bnode1hn5vtssbx2 . + "Avatar" . + "name" . + . + "Avatar" . + "Avatar" . + _:Bnode1hn5vtssbx2 . + . + . + . + "Avatar Director: James Cameron (born August 16, 1954)" . + "
\n

Avatar

Director: James Cameron (born August 16, 1954)\n
" .