From: Jan-Pascal van Best Date: Wed, 29 Apr 2015 20:46:47 +0000 (+0200) Subject: tvgids.tv 95% done X-Git-Url: http://www.vanbest.org/gitweb/?a=commitdiff_plain;h=520d4bb1c8138ada7076e7d6c1532d84a09d8cbb;p=tv_grab_nl_java tvgids.tv 95% done --- diff --git a/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java b/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java index da7023b..be7ad24 100644 --- a/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java +++ b/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java @@ -70,6 +70,7 @@ public abstract class AbstractEPGSource implements EPGSource { } protected String fetchURL(URL url, String charset) throws Exception { + logger.trace(url); StringBuffer buf = new StringBuffer(); boolean done = false; for (int count = 0; !done; count++) { @@ -106,6 +107,20 @@ public abstract class AbstractEPGSource implements EPGSource { protected JSONObject fetchJSON(URL url) throws Exception { return fetchJSON(url, Charset.defaultCharset().name()); } + + protected org.jsoup.nodes.Document fetchJsoup(URL url, String charset) throws Exception { + String html = fetchURL(url, charset); + return org.jsoup.Jsoup.parse(html); + } + + protected org.jsoup.nodes.Document fetchJsoup(URL url) throws Exception { + return fetchJsoup(url, Charset.defaultCharset().name()); + } + + protected org.jsoup.nodes.Document fetchJsoup(String url) throws Exception { + return fetchJsoup(new URL(url)); + } + public void clearCache() { cache.clear(getName()); diff --git a/src/main/java/org/vanbest/xmltv/Config.java b/src/main/java/org/vanbest/xmltv/Config.java index efc027d..fcf8b64 100644 --- a/src/main/java/org/vanbest/xmltv/Config.java +++ b/src/main/java/org/vanbest/xmltv/Config.java @@ -131,26 +131,30 @@ public class Config { static private Map getDefaultCattrans() { Map result = new HashMap(); result.put("amusement", "Unknown"); - result.put("comedy", "Comedy"); - result.put("documentaire", "Documentary"); - result.put("educatief", "Educational"); + result.put("overige", "Misc"); result.put("erotiek", "Adult"); - result.put("film", "Movie"); result.put("muziek", "Art/Music"); - result.put("informatief", "Educational"); - result.put("jeugd", "Children"); + result.put("theater", "Art/Music"); result.put("kunst/cultuur", "Art/Music"); + result.put("muziek", "Art/Music"); + result.put("jeugd", "Children"); + result.put("comedy", "Comedy"); result.put("misdaad", "Crime/Mystery"); - result.put("muziek", "Music"); - result.put("natuur", "Science/Nature"); + result.put("documentaire", "Documentary"); + result.put("Documentair portret", "Documentary"); + result.put("serie/soap", "Drama"); + result.put("educatief", "Educational"); + result.put("informatief", "Educational"); + result.put("film", "Movie"); result.put("nieuws/actualiteiten", "News"); - result.put("overige", "Unknown"); + result.put("news", "News"); + result.put("Nieuwsuitzending", "News"); + result.put("natuur", "Science/Nature"); + result.put("Wetenschappelijk magazine", "Science/Nature"); result.put("religieus", "Religion"); - result.put("serie/soap", "Drama"); - result.put("sport", "Sports"); - result.put("theater", "Art/Music"); result.put("wetenschap", "Science/Nature"); - result.put("news", "News"); + result.put("sport", "Sports"); + result.put("Magazine", "Talk"); return result; } diff --git a/src/main/java/org/vanbest/xmltv/TvGidsTv.java b/src/main/java/org/vanbest/xmltv/TvGidsTv.java index 04d36c8..92c1518 100644 --- a/src/main/java/org/vanbest/xmltv/TvGidsTv.java +++ b/src/main/java/org/vanbest/xmltv/TvGidsTv.java @@ -124,15 +124,15 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { Document doc; try { - doc = Jsoup.connect(CHANNEL_BASE_URL).get(); - } catch (IOException e) { + doc = fetchJsoup(CHANNEL_BASE_URL); + } catch (Exception e) { logger.error("Exception reading tvgids.tv channel list", e); return result; } Elements links = doc.select("div.channels a[href^=/zenders/]"); for (Element link: links) { - logger.debug(link.toString()); + //logger.debug(link.toString()); String name = link.select("div.channel-name").text(); String url = link.attr("href"); String id = url.replace("/zenders/", ""); @@ -172,8 +172,8 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { Document doc; try { logger.debug("Programme url: " + programmeUrl(c, day)); - doc = Jsoup.connect(programmeUrl(c, day)).get(); - } catch (IOException e) { + doc = fetchJsoup(programmeUrl(c, day)); + } catch (Exception e) { logger.error("Exception reading tvgids.tv programme list for " + c.defaultName() + " @" + day, e); return result; } @@ -213,6 +213,7 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { // We're supposing that the programmes are time-ordered here cal.add(Calendar.DAY_OF_MONTH, 1); } + cal.add(Calendar.DAY_OF_MONTH, day); Programme p = cache.get(getName(), programmeId); boolean cached = (p != null); @@ -437,20 +438,15 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { } } */ + static private final Pattern timePattern = Pattern.compile("([0-9]+):([0-9]+).*?([0-9]+):([0-9]+)"); + private void fillDetails(String detailUrl, Programme result) throws Exception { - Pattern progInfoPattern = Pattern.compile( - "prog-info-content.*prog-info-footer", Pattern.DOTALL); - Pattern infoLinePattern = Pattern - .compile("
  • (.*?):(.*?)
  • "); - Pattern HDPattern = Pattern.compile("HD \\d+[ip]?"); - Pattern kijkwijzerPattern = Pattern - .compile("\"(.*?)\""); - + Document doc; try { - doc = Jsoup.connect(detailUrl).get(); - } catch (IOException e) { + doc = fetchJsoup(detailUrl); + } catch (Exception e) { logger.error("Exception reading tvgids.tv detail for programme " + detailUrl, e); return; } @@ -463,23 +459,55 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { //logger.debug(" > " + next.nodeName() + ": " + next.text()); String key = element.text().toLowerCase(); String value = next.text(); + logger.trace(" " + key + ": " + value); if (key.equals("datum")) { - + // ignored, already present } else if (key.equals("tijd")) { - + //logger.trace("Tijd veld: \"" + value + "\""); + Matcher m = timePattern.matcher(value); + if (m.find() && m.groupCount()>=4 ) { + try { + //logger.trace(m.group(1) + ":" + m.group(2) + " en dan " + m.group(3) + ":" + m.group(4)); + int hourFrom = Integer.parseInt(m.group(1)); + int minuteFrom = Integer.parseInt(m.group(2)); + int hourTo = Integer.parseInt(m.group(3)); + int minuteTo = Integer.parseInt(m.group(4)); + Calendar cal = Calendar.getInstance(Locale.forLanguageTag("nl-NL")); + cal.setTime(result.startTime); + cal.add(Calendar.HOUR_OF_DAY, hourTo - hourFrom); + cal.add(Calendar.MINUTE, minuteTo - minuteFrom); + if (hourTo < hourFrom) { + cal.add(Calendar.HOUR_OF_DAY, 24); + } + result.endTime = cal.getTime(); + } catch (NumberFormatException e) { + logger.warn("Illegal tijd field \"" + value + "\""); + } + } } else if (key.equals("genre")) { - + //String category = config.translateCategory(value); + //if(category.equals(value)) { + // logger.warn("Untranslated genre: \"" + value + "\""); + //} + //result.addCategory(config.translateCategory(value)); + result.addCategory(value); } else if (key.equals("deel-url")) { result.addUrl(value); - logger.trace(element.toString()); - logger.trace(next.toString()); + //logger.trace(element.toString()); + //logger.trace(next.toString()); } else if (key.equals("presentatie")) { String[] presenters = value.split(","); for(String presenter: presenters) { result.addPresenter(presenter.trim()); } } else if (key.equals("jaar")) { - + //logger.trace(element.toString()); + //logger.trace(next.toString()); + try { + result.year = Integer.parseInt(value); + } catch (NumberFormatException e) { + logger.warn("Illegal year format \"" + value + "\""); + } } else if (key.equals("acteurs")) { String[] actors = value.split(","); for(String actor: actors) { @@ -488,11 +516,14 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { } else if (key.equals("regisseur")) { result.addDirector(value); } else if (key.equals("officiële website")) { - result.addUrl(value); + result.addUrl(next.select("a[href]").attr("href")); + //logger.trace(element.toString()); + //logger.trace(next.toString()); + //logger.trace(" URL: " + next.select("a[href]").attr("href")); } else if (key.equals("twitter hashtag")) { - + // ignore newfangled twitter thingie } else if (key.equals("officiële twitter")) { - + // ignore } else if (key.equals("uitzending gemist")) { //logger.debug("Uitzending gemist: \"" + value + "\""); //logger.trace(element.toString()); @@ -500,17 +531,68 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { //logger.debug(" gemist URL: " + next.select("a[href]").attr("href")); result.addUrl(next.select("a[href]").attr("href")); } else if (key.equals("imdb")) { - logger.trace(element.toString()); - logger.trace(next.toString()); + //logger.trace(element.toString()); + //logger.trace(next.toString()); + // e.g. "width: 73%" + String ratingString = next.select(".stars .bar").attr("style"); + Pattern widthPattern = Pattern.compile("(\\d+)%"); + Matcher m = widthPattern.matcher(ratingString); + if (m.find() && m.groupCount()>=1 ) { + try { + int percentage = Integer.parseInt(m.group(1)); + result.addStarRating(percentage, 100); + } catch (NumberFormatException e) { + logger.warn("Illegal imdb percentage: \"" + m.group(1) + "\""); + logger.debug(next.toString()); + } + } + // Add IMDB url + result.addUrl(next.select("a[href]").attr("href")); + } else if (key.equals("kijkwijzer")) { + //logger.trace(element.toString()); + //logger.trace(next.toString()); + + List list = new ArrayList(); + for(Element icon: next.select(".kijkwijzer-icon")) + { + for(String c: icon.classNames()) { + //logger.debug("Looking at \"" + c + "\""); + if (c.startsWith("kijkwijzer-")) { + c = c.replace("kijkwijzer-", ""); + if (c.equals("icon")) continue; + //logger.debug("Looking at \"" + c + "\""); + list.add(c); + } + } + } + if (config.joinKijkwijzerRatings) { + // mythtv doesn't understand multiple tags + result.addRating("kijkwijzer", StringUtils.join(list, ",")); + } else { + for (String rating : list) { + result.addRating("kijkwijzer", rating); + } + } } else { logger.warn("Unknown details element \"" + key + "\": \"" + value + "\""); + logger.trace(element.toString()); + logger.trace(next.toString()); } } Elements descElements = doc.select(".section-item p"); + result.addDescription(descElements.text()); //logger.debug("Description: " + descElements.text() ); /* + Pattern progInfoPattern = Pattern.compile( + "prog-info-content.*prog-info-footer", Pattern.DOTALL); + Pattern infoLinePattern = Pattern + .compile("
  • (.*?):(.*?)
  • "); + Pattern HDPattern = Pattern.compile("HD \\d+[ip]?"); + Pattern kijkwijzerPattern = Pattern + .compile("\"(.*?)\""); + URL url = HTMLDetailUrl(id); String clob = fetchURL(url); Matcher m = progInfoPattern.matcher(clob); @@ -580,13 +662,13 @@ public class TvGidsTv extends AbstractEPGSource implements EPGSource { writer.writeDTD(""); writer.writeCharacters("\n"); writer.writeStartElement("tv"); - // List my_channels = channels; - List my_channels = channels.subList(0, 2); + List my_channels = channels; + //List my_channels = channels.subList(0, 15); for (Channel c : channels) { c.serialize(writer, true); } writer.flush(); - List programmes = gids.getProgrammes(my_channels, 2); + List programmes = gids.getProgrammes(my_channels, 1); for (Programme p : programmes) { p.serialize(writer); }