Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve event page parsing #9

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 82 additions & 94 deletions src/main/java/org/parisjug/eventpublisher/eventpage/HtmlEventPage.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package org.parisjug.eventpublisher.eventpage;

import static java.nio.charset.StandardCharsets.UTF_8;
import static java.time.format.DateTimeFormatter.ISO_INSTANT;

import java.io.File;
import java.io.IOException;
import java.net.URLEncoder;
Expand All @@ -10,19 +13,22 @@

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HtmlEventPage implements EventPage {
private static final String ABSOLUTE_URL = "https://www.parisjug.org";

Document doc;

@Override
public String getTitle() {
Elements titleElements = doc.select(".post__title");
if (titleElements.isEmpty()) {
Element titleElement = this.doc.selectFirst(".post__title");
if (titleElement == null) {
throw new EventPageCheckException(
"The page should contain an element with the id \"title\". For instance: <div id=\"title\">Quarkus World Tour</div>.");
}
return titleElements.first().text();
return titleElement.text();
}

protected void loadFromLocalHtmlFile(File htmlFile) {
Expand All @@ -43,70 +49,58 @@ protected void loadFromUrl(String url) {

@Override
public String getDetails() {
return getPart1() + getBuffet() + getPart2();
return convertToAbsoluteLinks(getPart1() + getBuffet() + getPart2());
}

public String getBuffet() {
Elements buffet = doc.select("#buffet");
if (buffet.isEmpty()) {
Elements detailh3 = doc.select("#détails");
if(detailh3.isEmpty()) {
return "";
}
String buffethtml = "";
Elements elements = detailh3.parents().first().children();
// for each element, in elements.stream() start at h3 with id contains buffet and append html until next h3
boolean start = false;
for(int i = 0; i < elements.size(); i++) {
if(elements.get(i).tagName().equals("h3") && elements.get(i).id().contains("buffet")) {
start = true;
}
if(elements.get(i).tagName().equals("h3") && !elements.get(i).id().contains("buffet")) {
start = false;
continue;
}
if(start) {
buffethtml += elements.get(i).outerHtml();
}
}
return buffethtml.replaceAll("href=\"/", "href=\"https://www.parisjug.org/");
Element buffet = doc.selectFirst("#buffet");
if (buffet != null) {
return buffet.html();
}
return buffet.first().html().replaceAll("href=\"/", "href=\"https://www.parisjug.org/");
Element buffetElement = doc.selectFirst("h3[id*='buffet']");
if (buffetElement == null) {
return "";
}
String buffetHtml = buffetElement.outerHtml();
while ((buffetElement = buffetElement.nextElementSibling()) != null &&
!"h3".equals(buffetElement.tagName())) {
buffetHtml+= buffetElement.outerHtml();
}
return buffetHtml;
}

@Override
public String getPart1() {
Elements part1 = doc.select("#part1");
if(part1.isEmpty()) {
Elements detailh3 = doc.select("#détails");
if(detailh3.isEmpty()) {
return "";
Element part1 = doc.selectFirst("#part1");
if (part1 != null) {
return part1.html();
}
Elements detailh3 = doc.select("#détails");
if(detailh3.isEmpty()) {
return "";
}
String part1html = "";
Elements elements = detailh3.parents().first().children();
// for each element, in elements.stream() start at h3 with id détail and append html until next h3
boolean start = false;
for(int i = 0; i < elements.size(); i++) {
if(elements.get(i).tagName().equals("h2") && elements.get(i).id().equals("détails")) {
start = true;
continue;
}
String part1html = "";
Elements elements = detailh3.parents().first().children();
// for each element, in elements.stream() start at h3 with id détail and append html until next h3
boolean start = false;
for(int i = 0; i < elements.size(); i++) {
if(elements.get(i).tagName().equals("h2") && elements.get(i).id().equals("détails")) {
start = true;
continue;
}
if(elements.get(i).tagName().equals("h2") && !elements.get(i).id().equals("détails")) {
start = false;
continue;
}
if(elements.get(i).tagName().equals("h3") && elements.get(i).id().contains("buffet")) {
start = false;
continue;
}
if(start) {
part1html += elements.get(i).outerHtml();
}
if(elements.get(i).tagName().equals("h2") && !elements.get(i).id().equals("détails")) {
start = false;
continue;
}
if(elements.get(i).tagName().equals("h3") && elements.get(i).id().contains("buffet")) {
start = false;
continue;
}
if(start) {
part1html += elements.get(i).outerHtml();
}
return part1html.replaceAll("href=\"/", "href=\"https://www.parisjug.org/");

}
return part1.first().html().replaceAll("href=\"/", "href=\"https://www.parisjug.org/");
return part1html;
}

@Override
Expand Down Expand Up @@ -138,30 +132,25 @@ public String getPart2() {
part2html += elements.get(i).outerHtml();
}
}
return part2html.replaceAll("href=\"/", "href=\"https://www.parisjug.org/");
return part2html;
}
return part2.first().html().replaceAll("href=\"/", "href=\"https://www.parisjug.org/");
return part2.first().html();
}

@Override
public String getDateTime() {
Elements dateTimeElement = doc.select("#datetime");
if (dateTimeElement.isEmpty()) {
// in the section starting with h2 id="date-et-lieu", get the first ul li element
Elements elements = doc.select("#date-et-lieu").parents().first().children();
for(int i = 0; i < elements.size(); i++) {
if(elements.get(i).tagName().equals("ul")) {
Elements lis = elements.get(i).children();
for(int j = 0; j < lis.size(); j++) {
if(lis.get(j).tagName().equals("li")) {
return lis.get(j).text();
}
}
}
}

Element dateTimeElement = doc.selectFirst("#datetime");
if (dateTimeElement != null) {
return dateTimeElement.text();
}
// in the section starting with h2 id="date-et-lieu", get the first ul li element
Element dateEtLieuElement = doc.selectFirst("#date-et-lieu + ul > li");
if (dateEtLieuElement == null) {
throw new EventPageCheckException(
"The page should contain an element with the id \"date-et-lieu\"."
);
}
return dateTimeElement.first().text();
return dateEtLieuElement.text();
}

@Override
Expand All @@ -170,7 +159,7 @@ public String getStartTime() {
if (isVirtual()) {
eventDateTime = eventDateTime.minusMinutes(15);
}
return eventDateTime.format(DateTimeFormatter.ISO_INSTANT).replace(":", "").replace("-", "");
return eventDateTime.format(ISO_INSTANT).replace(":", "").replace("-", "");
}

private ZonedDateTime getEventZonedDateTime() {
Expand All @@ -185,7 +174,7 @@ public String getEndTime() {
} else {
eventDateTime = eventDateTime.plusMinutes(180);
}
return eventDateTime.format(DateTimeFormatter.ISO_INSTANT).replace(":", "").replace("-", "");
return eventDateTime.format(ISO_INSTANT).replace(":", "").replace("-", "");
}

ZonedDateTime parseDateTime(String datetimeInput) {
Expand All @@ -204,23 +193,19 @@ public String getLongTitle() {

@Override
public String getLocation() {
Elements locationElement = doc.select("#location a");
if (locationElement.isEmpty()) {
Element locationElement = doc.selectFirst("#location a");
if (locationElement == null) {
// in the section starting with h2 id="date-et-lieu", get the second li element
Elements elements = doc.select("#date-et-lieu").parents().first().getElementsByTag("li");
if(elements.size() > 1) {
locationElement = elements.get(1).getElementsByTag("a");
}
else {
return "";
}
locationElement = doc.selectFirst("#date-et-lieu + ul > li:eq(1) > a");
}

String attr = locationElement.first().attr("href");
if (attr.startsWith("/")) {
attr = "https://www.parisjug.org" + attr;
if (locationElement == null) {
return "";
}
String location = locationElement.attr("href");
if (location.startsWith("/")) {
location = ABSOLUTE_URL + location;
}
return attr;
return location;
}

@Override
Expand All @@ -234,21 +219,24 @@ public String generateGcalLink() {
}

String encode(String str) {
return URLEncoder.encode(str, StandardCharsets.UTF_8);
return URLEncoder.encode(str, UTF_8);
}

@Override
public String getIntro() {
Elements intro = doc.select("#intro");
if (intro.isEmpty()) {
Element intro = this.doc.selectFirst("#intro");
if (intro == null) {
return "";
}
return intro.first().html().replaceAll("href=\"/", "href=\"https://www.parisjug.org/");
return convertToAbsoluteLinks(intro.html());
}

/**
 * Tells whether this event is a virtual one.
 *
 * <p>The decision is based solely on the page title: the event is treated as
 * virtual when the title contains the marker text {@code "Soirée Virtuelle"}.
 *
 * @return {@code true} if the title contains the virtual-event marker,
 *         {@code false} otherwise
 */
@Override
public boolean isVirtual() {
    final String title = getTitle();
    final boolean virtual = title.contains("Soirée Virtuelle");
    return virtual;
}

/**
 * Rewrites site-relative links in an HTML fragment into absolute links.
 *
 * <p>Every {@code href="/path"} attribute is turned into
 * {@code href="<ABSOLUTE_URL>/path"}. Protocol-relative links
 * ({@code href="//host/path"}) already point to another host and are left
 * untouched; prefixing them with the site URL would break them.
 *
 * @param html the HTML fragment to process
 * @return the fragment with site-relative {@code href} values made absolute
 */
private static String convertToAbsoluteLinks(String html) {
    // (?!/) is a negative lookahead: match href="/ only when the next
    // character is not another slash, i.e. skip protocol-relative URLs.
    // ABSOLUTE_URL contains no '$' or '\', so it is safe to use directly in
    // the replaceAll replacement string.
    return html.replaceAll("href=\"/(?!/)", "href=\"" + ABSOLUTE_URL + "/");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public void test_inRealLifeEventPage() {
// title
assertEquals("Soirée Panama (GL)", page.getTitle(), "Title from page");

// assert page.getPart1 should contains "19h30 : Panama - Foreign Function &
// assert page.getPart1 should contain "19h30 : Panama - Foreign Function &
// Memory"
assertTrue(page.getPart1().contains("19h30 : Panama - Foreign Function"),
"part1 should contains 19h30 : Panama - Foreign Function");
Expand Down
Loading