From: Jan-Pascal van Best Date: Sat, 25 Apr 2015 19:35:05 +0000 (+0200) Subject: First steps for new EPG source tvgids.tv, which gives more days forecast for the... X-Git-Url: http://www.vanbest.org/gitweb/?a=commitdiff_plain;h=cddd441edabb03a71457343435bc86d0c707bf20;p=tv_grab_nl_java First steps for new EPG source tvgids.tv, which gives more days forecast for the Dutch NPO channels --- diff --git a/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java b/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java index 4684ea7..da7023b 100644 --- a/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java +++ b/src/main/java/org/vanbest/xmltv/AbstractEPGSource.java @@ -19,6 +19,7 @@ The full license text can be found in the LICENSE file. import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; @@ -59,6 +60,11 @@ public abstract class AbstractEPGSource implements EPGSource { cache.close(); } + protected String fetchURL(String s) throws Exception { + URL url = new URL(s); + return fetchURL(url); + } + protected String fetchURL(URL url) throws Exception { return fetchURL(url, Charset.defaultCharset().name()); } diff --git a/src/main/java/org/vanbest/xmltv/Channel.java b/src/main/java/org/vanbest/xmltv/Channel.java index 67e3075..fc9d418 100644 --- a/src/main/java/org/vanbest/xmltv/Channel.java +++ b/src/main/java/org/vanbest/xmltv/Channel.java @@ -38,8 +38,8 @@ public class Channel { // Use default xmltvid with id+"."+sourceName static Channel getChannel(String source, String id, String name) { - String xmltv = id + "." + source; - Channel c = new Channel(source, id, xmltv); + String xmltv = id + "." 
+ source; + Channel c = new Channel(source, id, xmltv); c.names.add(name); return c; } diff --git a/src/main/java/org/vanbest/xmltv/TvGids.java b/src/main/java/org/vanbest/xmltv/TvGids.java index b8d5f71..91912d5 100644 --- a/src/main/java/org/vanbest/xmltv/TvGids.java +++ b/src/main/java/org/vanbest/xmltv/TvGids.java @@ -139,10 +139,9 @@ public class TvGids extends AbstractEPGSource implements EPGSource { int id = zender.getInt("id"); String name = org.apache.commons.lang.StringEscapeUtils .unescapeHtml(zender.getString("name")); - String icon = "http://tvgidsassets.nl/img/channels/53x27/" + id - + ".png"; + String icon = "http://tvgidsassets.nl/img/channels/53x27/" + id + ".png"; Channel c = Channel.getChannel(getName(), Integer.toString(id), name); - c.addIcon(icon); + c.addIcon(icon); result.add(c); } diff --git a/src/main/java/org/vanbest/xmltv/TvGidsTv.java b/src/main/java/org/vanbest/xmltv/TvGidsTv.java new file mode 100644 index 0000000..04d36c8 --- /dev/null +++ b/src/main/java/org/vanbest/xmltv/TvGidsTv.java @@ -0,0 +1,611 @@ +package org.vanbest.xmltv; + +/* + Copyright (c) 2012-2015 Jan-Pascal van Best + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + The full license text can be found in the LICENSE file. 
+ */
+
+import java.io.BufferedReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.stream.XMLOutputFactory;
+import javax.xml.stream.XMLStreamWriter;
+
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
+import net.sf.json.JSON;
+import net.sf.json.JSONArray;
+import net.sf.json.JSONObject;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * EPG source that scrapes the HTML pages of www.tvgids.tv with jsoup.
+ *
+ * Channels are read from the "/zenders" overview page and programme
+ * listings from "/zenders/&lt;channel id&gt;/&lt;day&gt;".
+ */
+public class TvGidsTv extends AbstractEPGSource implements EPGSource {
+
+    static String BASE_URL = "http://www.tvgids.tv";
+    static String CHANNEL_BASE_URL = BASE_URL + "/zenders";
+    static String DETAIL_BASE_URL = BASE_URL + "/tv";
+
+    // Only referenced by the commented-out legacy code further down.
+    private static final int MAX_PROGRAMMES_PER_DAY = 9999;
+    private static final int MAX_DAYS_AHEAD_SUPPORTED_BY_TVGIDS = 3;
+    public static final String NAME="tvgids.tv";
+
+    // Was Logger.getLogger(TvGids.class): a copy-paste slip that filed every
+    // message of this class under the category of the tvgids.nl source.
+    static Logger logger = Logger.getLogger(TvGidsTv.class);
+
+    /**
+     * Creates the source; the configuration is handed to the superclass.
+     *
+     * @param config grabber configuration
+     */
+    public TvGidsTv(Config config) {
+        super(config);
+    }
+
+    /**
+     * @return the name of this source, {@link #NAME}
+     */
+    public String getName() {
+        return NAME;
+    }
+
+    /**
+     * Builds the URL of the programme listing of one channel for one day:
+     * {@code CHANNEL_BASE_URL + "/" + channel.id + "/" + day}.
+     *
+     * @param channel channel whose {@code id} is put into the URL
+     * @param day day number, appended to the URL as-is
+     * @return the listing URL as a string
+     * @throws Exception never thrown by the current body; kept in the
+     *         signature so existing callers keep compiling
+     */
+    public static String programmeUrl(Channel channel, int day)
+            throws Exception
+    {
+        return CHANNEL_BASE_URL + "/" + channel.id + "/" + day;
+    }
+
+    /*
+    public static URL programmeUrl(List channels, int day)
+            throws Exception {
+        StringBuilder s = new StringBuilder(programme_base_url);
+        if (channels.size() < 1) {
+            throw new Exception("should have at least one channel");
+        }
+        s.append("?channels=");
+        boolean first = true;
+        for (Channel i : channels) {
+            if 
(first) {
+                s.append(i.id);
+                first = false;
+            } else {
+                s.append("," + i.id);
+            }
+        }
+        s.append("&day=");
+        s.append(day);
+
+        return new URL(s.toString());
+    }
+
+    public static URL JSONDetailUrl(String id) throws Exception {
+        StringBuilder s = new StringBuilder(detail_base_url);
+        s.append("?id=");
+        s.append(id);
+        return new URL(s.toString());
+    }
+
+    public static URL HTMLDetailUrl(String id) throws Exception {
+        StringBuilder s = new StringBuilder(html_detail_base_url);
+        s.append(id);
+        s.append("/");
+        return new URL(s.toString());
+    }
+    */
+
+    /**
+     * Scrapes the tvgids.tv channel overview page and returns one
+     * {@link Channel} for every link below {@code div.channels} whose href
+     * starts with "/zenders/". The channel id is the remainder of that href.
+     *
+     * If the page cannot be fetched the error is logged and an empty list is
+     * returned.
+     *
+     * @return the channels found, possibly empty, never {@code null}
+     * @see org.vanbest.xmltv.EPGSource#getChannels()
+     */
+    @Override
+    public List<Channel> getChannels() {
+        List<Channel> result = new ArrayList<Channel>(10);
+
+        Document doc;
+        try {
+            doc = Jsoup.connect(CHANNEL_BASE_URL).get();
+        } catch (IOException e) {
+            logger.error("Exception reading tvgids.tv channel list", e);
+            return result;
+        }
+
+        Elements links = doc.select("div.channels a[href^=/zenders/]");
+        for (Element link : links) {
+            logger.debug(link.toString());
+            String name = link.select("div.channel-name").text();
+            String id = link.attr("href").replace("/zenders/", "");
+            if (id.isEmpty()) {
+                // A bare "/zenders/" link carries no channel id; the old
+                // "id != null" test could never be false.
+                continue;
+            }
+
+            // The icon is not linked directly; its name is derived from a
+            // "sprite-channel-<x>" CSS class on the icon element.
+            String iconUrl = null;
+            Element iconElement = link.select("div.channel-icon").first();
+            if (iconElement != null) {
+                Set<String> classNames = iconElement.classNames();
+                for (String s : classNames) {
+                    if (s.startsWith("sprite-channel")) {
+                        String sprite = s.replace("sprite-channel-", "");
+                        iconUrl = "http://images.cdn.tvgids.tv/channels/channel_" + sprite + "_BIG@2x.png";
+                    }
+                }
+            }
+
+            Channel c = Channel.getChannel(getName(), id, name);
+            if (iconUrl != null) {
+                c.addIcon(iconUrl);
+            }
+            result.add(c);
+        }
+
+        return result;
+    }
+
+    /**
+     * Scrapes the programme listing of every given channel for one day.
+     *
+     * Each {@code a.section-item} link yields one programme; its title text
+     * is expected to look like "HH:mm Title". Entries that do not match are
+     * logged and skipped. Programmes are looked up in the cache by the id
+     * taken from the link; only new programmes get a title, details (when
+     * {@code config.fetchDetails} is set) and a cache entry.
+     *
+     * A channel whose listing cannot be fetched is logged, counted in
+     * {@code stats.fetchErrors} and skipped; the other channels are still
+     * processed.
+     *
+     * @param channels channels to fetch
+     * @param day day number passed to {@link #programmeUrl(Channel, int)}
+     * @return the programmes found, in page order per channel
+     * @throws Exception if building a URL or filling details fails
+     * @see org.vanbest.xmltv.EPGSource#getProgrammes(java.util.List, int,
+     *      boolean)
+     */
+    @Override
+    public List<Programme> getProgrammes(List<Channel> channels, int day)
+            throws Exception {
+        List<Programme> result = new ArrayList<Programme>();
+
+        for (Channel c : channels) {
+            String listingUrl = programmeUrl(c, day);
+            logger.debug("Programme url: " + listingUrl);
+            Document doc;
+            try {
+                doc = Jsoup.connect(listingUrl).get();
+            } catch (IOException e) {
+                logger.error("Exception reading tvgids.tv programme list for " + c.defaultName() + " @" + day, e);
+                // Previously this returned immediately, silently dropping
+                // every channel that had not been fetched yet.
+                stats.fetchErrors++;
+                continue;
+            }
+
+            Elements links = doc.select("a.section-item");
+            boolean afternoon = false;
+            for (Element link : links) {
+                String href = link.attr("href");
+                String detailUrl = BASE_URL + href;
+                String programmeId = href.replace("/tv/", "");
+                String timeTitle = link.select(".section-item-title").text();
+
+                // Check the split result BEFORE indexing into it: an entry
+                // without a title used to raise
+                // ArrayIndexOutOfBoundsException on parts[1].
+                String[] parts = timeTitle.trim().split(" ", 2);
+                if (parts.length != 2) {
+                    logger.error("Programme time/title weird: \"" + timeTitle + "\"");
+                    continue;
+                }
+                String time = parts[0];
+                String title = parts[1];
+
+                String[] time_parts = time.split(":", 2);
+                if (time_parts.length != 2) {
+                    logger.error("Programme time weird: \"" + timeTitle + "\"");
+                    continue;
+                }
+                int hour;
+                int minute;
+                try {
+                    hour = Integer.parseInt(time_parts[0]);
+                    minute = Integer.parseInt(time_parts[1]);
+                } catch (NumberFormatException e) {
+                    // A non-numeric time is just another malformed entry.
+                    logger.error("Programme time weird: \"" + timeTitle + "\"");
+                    continue;
+                }
+
+                Calendar cal = Calendar.getInstance(Locale.forLanguageTag("nl-NL"));
+                cal.setTimeZone(TimeZone.getTimeZone("Europe/Amsterdam"));
+                // The listing page is fetched for "day", but the start time
+                // used to be computed on today's date regardless.
+                // NOTE(review): assumes "day" counts days ahead of today
+                // (0 = today) - confirm against the caller.
+                cal.add(Calendar.DAY_OF_MONTH, day);
+                cal.set(Calendar.HOUR_OF_DAY, hour);
+                cal.set(Calendar.MINUTE, minute);
+                cal.set(Calendar.SECOND, 0);
+                cal.set(Calendar.MILLISECOND, 0);
+                if (hour >= 15) {
+                    afternoon = true;
+                }
+                if (hour < 11 && afternoon) {
+                    // We've rolled into the night, so it's the next day
+                    // We're supposing that the programmes are time-ordered here
+                    cal.add(Calendar.DAY_OF_MONTH, 1);
+                }
+
+                Programme p = cache.get(getName(), programmeId);
+                boolean cached = (p != null);
+                if (!cached) {
+                    stats.cacheMisses++;
+                    p = new Programme();
+                    p.channel = c.getXmltvChannelId();
+                    // Do this here, because we can only add to these fields.
+                    // Pity if they're updated
+                    p.addTitle(title);
+                } else {
+                    stats.cacheHits++;
+                }
+                p.startTime = cal.getTime();
+
+                if (config.fetchDetails && !cached) {
+                    // TODO also read details if those have not been cached
+                    fillDetails(detailUrl, p);
+                }
+                if (!cached) {
+                    // FIXME where to do this?
+                    cache.put(getName(), programmeId, p);
+                }
+                logger.debug(p.toString());
+                result.add(p);
+            }
+        }
+
+        /*
+        if (day > MAX_DAYS_AHEAD_SUPPORTED_BY_TVGIDS) {
+            return result; // empty list
+        }
+
+        URL url = programmeUrl(channels, day);t
+
+        JSONObject jsonObject = fetchJSON(url);
+
+        for (Channel c : channels) {
+            JSON ps = (JSON) jsonObject.get(c.id);
+            if (ps.isArray()) {
+                JSONArray programs = (JSONArray) ps;
+                for (int i = 0; i < programs.size()
+                        && i < MAX_PROGRAMMES_PER_DAY; i++) {
+                    JSONObject programme = programs.getJSONObject(i);
+                    Programme p = programmeFromJSON(programme,
+                            config.fetchDetails);
+                    p.channel = c.getXmltvChannelId();
+                    result.add(p);
+                }
+            } else {
+                JSONObject programs = (JSONObject) ps;
+                int count = 0;
+                for (Object o : programs.keySet()) {
+                    if (count > MAX_PROGRAMMES_PER_DAY)
+                        break;
+                    JSONObject programme = programs.getJSONObject(o.toString());
+                    Programme p = programmeFromJSON(programme,
+                            config.fetchDetails);
+                    p.channel = c.getXmltvChannelId();
+                    result.add(p);
+                    count++;
+                }
+            }
+        }
+*/
+        return result;
+    }
+/*
+    private Programme programmeFromJSON(JSONObject programme,
+            boolean fetchDetails) throws Exception {
+        String id = programme.getString("db_id");
+        Programme result = cache.get(getName(), id);
+        boolean cached = (result != null);
+        if (result == null) {
+            stats.cacheMisses++;
+            result = new Programme();
+            // Do this here, because we can only add to these fields. 
Pity if + // they're updated + result.addTitle(programme.getString("titel")); + String genre = programme.getString("genre"); + if (genre != null && !genre.isEmpty()) + result.addCategory(config.translateCategory(genre)); + String kijkwijzer = programme.getString("kijkwijzer"); + if (kijkwijzer != null && !kijkwijzer.isEmpty()) { + List list = parseKijkwijzer(kijkwijzer); + if (config.joinKijkwijzerRatings) { + // mythtv doesn't understand multiple tags + result.addRating("kijkwijzer", StringUtils.join(list, ",")); + } else { + for (String rating : list) { + result.addRating("kijkwijzer", rating); + } + } + // TODO add icon from HTML detail page + } + } else { + // System.out.println("From cache: " + + // programme.getString("titel")); + stats.cacheHits++; + } + logger.trace(" titel:" + programme.getString("titel")); + logger.trace("datum_start:" + programme.getString("datum_start")); + logger.trace(" datum_end:" + programme.getString("datum_end")); + logger.trace(" genre:" + programme.getString("genre")); + SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", + new Locale("nl")); + result.startTime = df.parse(programme.getString("datum_start")); + result.endTime = df.parse(programme.getString("datum_end")); + // TODO other fields + + if (fetchDetails && !cached) { + // TODO also read details if those have not been cached + fillDetails(id, result); + } + if (!cached) { + // FIXME where to do this? 
+ cache.put(getName(), id, result); + } + logger.debug(result); + return result; + } +*/ + +/* + private void fillDetails(String id, Programme result) throws Exception { + try { + fillJSONDetails(id, result); + } catch (Exception e) { + logger.warn("Error fetching details for programme " + + result.toString()); + } + try { + fillScraperDetails(id, result); + } catch (Exception e) { + logger.warn("Error fetching details for programme " + + result.toString()); + } + + if ((result.secondaryTitles == null || result.secondaryTitles.isEmpty()) + && (!result.hasCategory("movies") && !result + .hasCategory("film"))) { + for (Programme.Title t : result.titles) { + String[] parts = t.title.split("\\s*:\\s*", 2); + if (parts.length >= 2 && parts[0].length() >= 5) { + logger.debug("Splitting title from \"" + t.title + + "\" to: \"" + parts[0].trim() + + "\"; sub-title: \"" + parts[1].trim() + "\""); + t.title = parts[0].trim(); + result.addSecondaryTitle(parts[1].trim()); + } + } + } + } +*/ + /* + * {"db_id":"12436404", "titel":"RTL Boulevard", "datum":"2012-03-30", + * "btijd":"23:45:00", "etijd":"00:40:00", "synop": + * "Amusementsprogramma Actualiteiten, vermaak en opinies met \u00e9\u00e9n of twee deskundigen, gasten of andere nieuwsmakers. In hoog tempo volgen afwisselende items en reportages elkaar op met de thema's showbizz, crime, royalty en lifestyle.

" + * , "kijkwijzer":"", "genre":"Amusement", + * "presentatie":"Winston Gerschtanowitz, Albert Verlinde", + * "acteursnamen_rolverdeling":"", "regisseur":"", "zender_id":"4"} + */ +/* + private void fillJSONDetails(String id, Programme result) throws Exception { + URL url = JSONDetailUrl(id); + JSONObject json = fetchJSON(url); + Set keys = json.keySet(); + for (String key : keys) { + String value = StringEscapeUtils.unescapeHtml(json.getString(key)); + if (value.isEmpty()) + continue; + if (key.equals("synop")) { + value = value.replaceAll("
", " ").replaceAll("
", " ") + .replaceAll("]*>", " ").replaceAll("

", " ")
+                        .replaceAll("", " ")
+                        .replaceAll("", " ").replaceAll("", " ")
+                        .replaceAll("", " ").trim();
+                if (value.isEmpty())
+                    continue;
+                result.addDescription(value);
+            } else if (key.equals("presentatie")) {
+                String[] parts = value.split(",");
+                for (String s : parts) {
+                    result.addPresenter(s.trim());
+                }
+            } else if (key.equals("acteursnamen_rolverdeling")) {
+                // TODO how could roles be indicated? Haven't seen any
+                // examples of that.
+                String[] parts = value.split(",");
+                for (String s : parts) {
+                    result.addActor(s.trim());
+                }
+            } else if (key.equals("regisseur")) {
+                String[] parts = value.split(",");
+                for (String s : parts) {
+                    result.addDirector(s.trim());
+                }
+            } else if (key.equals("kijkwijzer")) {
+                // TODO
+            } else if (key.equals("db_id")) {
+                // ignore
+            } else if (key.equals("titel")) {
+                // ignore
+            } else if (key.equals("datum")) {
+                // ignore
+            } else if (key.equals("btijd")) {
+                // ignore
+            } else if (key.equals("etijd")) {
+                // ignore
+            } else if (key.equals("genre")) {
+                // ignore
+            } else if (key.equals("zender_id")) {
+                // ignore
+            } else {
+                logger.warn("Unknown key in tvgids.nl json details: \"" + key
+                        + "\"");
+            }
+        }
+    }
+*/
+    /**
+     * Fetches the detail page of one programme and copies the fields it
+     * understands into {@code result}.
+     *
+     * Every {@code dt} below {@code .program-details} is treated as a key
+     * and its next sibling element as the value. URLs, presenters, actors
+     * and the director are stored; date, time, genre, year and the twitter
+     * fields are recognised but ignored; anything else is logged as unknown.
+     *
+     * If the page cannot be fetched the error is logged, counted in
+     * {@code stats.fetchErrors} and {@code result} is left unchanged.
+     *
+     * @param detailUrl absolute URL of the programme's detail page
+     * @param result programme to enrich in place
+     * @throws Exception declared by the original signature and kept for
+     *         callers
+     */
+    private void fillDetails(String detailUrl, Programme result)
+            throws Exception {
+        Document doc;
+        try {
+            doc = Jsoup.connect(detailUrl).get();
+        } catch (IOException e) {
+            logger.error("Exception reading tvgids.tv detail for programme " + detailUrl, e);
+            stats.fetchErrors++;
+            return;
+        }
+
+        Elements details = doc.select(".program-details dt");
+        for (Element element : details) {
+            Element next = element.nextElementSibling();
+            if (next == null) {
+                // A trailing key without a value element used to end in a
+                // NullPointerException that aborted the whole grab.
+                logger.warn("Details element \"" + element.text() + "\" has no value");
+                continue;
+            }
+            // Locale.ROOT keeps the comparison below independent of the
+            // platform locale.
+            String key = element.text().toLowerCase(Locale.ROOT);
+            String value = next.text();
+            if (key.equals("datum") || key.equals("tijd")
+                    || key.equals("genre") || key.equals("jaar")
+                    || key.equals("twitter hashtag")
+                    || key.equals("officiële twitter")) {
+                // recognised, but not stored (yet)
+            } else if (key.equals("deel-url")) {
+                result.addUrl(value);
+                logger.trace(element.toString());
+                logger.trace(next.toString());
+            } else if (key.equals("presentatie")) {
+                for (String presenter : value.split(",")) {
+                    result.addPresenter(presenter.trim());
+                }
+            } else if (key.equals("acteurs")) {
+                for (String actor : value.split(",")) {
+                    result.addActor(actor.trim());
+                }
+            } else if (key.equals("regisseur")) {
+                result.addDirector(value);
+            } else if (key.equals("officiële website")) {
+                result.addUrl(value);
+            } else if (key.equals("uitzending gemist")) {
+                // The URL is in the link target, not in the visible text.
+                String gemistUrl = next.select("a[href]").attr("href");
+                if (!gemistUrl.isEmpty()) {
+                    result.addUrl(gemistUrl);
+                }
+            } else if (key.equals("imdb")) {
+                logger.trace(element.toString());
+                logger.trace(next.toString());
+            } else {
+                logger.warn("Unknown details element \"" + key + "\": \"" + value + "\"");
+            }
+        }
+
+        // TODO the description text is available under ".section-item p"
+        // but is not stored yet.
+
+        // Legacy regex-based scraper, kept for reference only. It relied on
+        // the locals progInfoPattern, infoLinePattern, HDPattern and
+        // kijkwijzerPattern, which were never used by live code and are no
+        // longer declared.
+/*
+        URL url = HTMLDetailUrl(id);
+        String clob = fetchURL(url);
+        Matcher m = progInfoPattern.matcher(clob);
+        if (m.find()) {
+            String progInfo = m.group();
+            Matcher m2 = infoLinePattern.matcher(progInfo);
+            while (m2.find()) {
+                logger.trace(" infoLine: " + m2.group());
+                logger.trace(" key: " + m2.group(1));
+                logger.trace(" value: " + m2.group(2));
+                String key = m2.group(1).toLowerCase();
+                String value = m2.group(2);
+                if (key.equals("bijzonderheden")) {
+                    String[] list = value.split(",");
+                    for (String item : list) {
+                        if (item.toLowerCase().contains("teletekst")) {
+                            result.addSubtitle("teletext");
+                        } else if (item.toLowerCase().contains("breedbeeld")) {
+                            result.setVideoAspect("16:9");
+                        } else if (value.toLowerCase().contains("zwart")) {
+                            result.setVideoColour(false);
+                        } else if (value.toLowerCase().contains("stereo")) {
+                            result.setAudioStereo("stereo");
+                        } else if (value.toLowerCase().contains("herhaling")) {
+                            result.setPreviouslyShown();
+                        } else {
+                            Matcher m3 = HDPattern.matcher(value);
+                            if (m3.find()) {
+                                result.setVideoQuality(m3.group());
+                            } else {
+                                logger.warn(" Unknown value in 'bijzonderheden': "
+                                        + item);
+                            }
+                        }
+                    }
+                } else {
+                    // ignore other keys for now
+                }
+                Matcher m3 = kijkwijzerPattern.matcher(progInfo);
+                List kijkwijzer = new ArrayList();
+                while (m3.find()) {
+                    kijkwijzer.add(m3.group(1));
+                }
+                if (!kijkwijzer.isEmpty()) {
+                    // logger.debug(" kijkwijzer: " + kijkwijzer);
+                }
+            }
+        }
+        */
+    }
+
+    /**
+     * Manual smoke test: clears the cache, fetches the channel list and the
+     * programmes of (at most) the first two channels for day 2, and writes
+     * everything to "tvgids.tv.xml" in the working directory.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        Logger.getRootLogger().setLevel(Level.TRACE);
+        Config config = Config.getDefaultConfig();
+        TvGidsTv gids = new TvGidsTv(config);
+        gids.clearCache();
+        try {
+            try {
+                List<Channel> channels = gids.getChannels();
+                System.out.println("Channels: " + channels);
+                // List my_channels = channels;
+                // getChannels() returns an empty list on a fetch error, so
+                // a fixed subList(0, 2) could throw here.
+                List<Channel> my_channels = channels.subList(0,
+                        Math.min(2, channels.size()));
+
+                // XMLStreamWriter.close() does not close the underlying
+                // writer, so the FileWriter is managed explicitly.
+                try (FileWriter out = new FileWriter("tvgids.tv.xml")) {
+                    XMLStreamWriter writer = XMLOutputFactory.newInstance()
+                            .createXMLStreamWriter(out);
+                    writer.writeStartDocument();
+                    writer.writeCharacters("\n");
+                    // NOTE(review): the empty DTD string looks like damage
+                    // from HTML extraction; presumably this was the XMLTV
+                    // doctype - confirm against the repository.
+                    writer.writeDTD("");
+                    writer.writeCharacters("\n");
+                    writer.writeStartElement("tv");
+                    for (Channel c : channels) {
+                        c.serialize(writer, true);
+                    }
+                    writer.flush();
+                    List<Programme> programmes = gids.getProgrammes(my_channels, 2);
+                    for (Programme p : programmes) {
+                        p.serialize(writer);
+                    }
+                    writer.writeEndElement();
+                    writer.writeEndDocument();
+                    writer.flush();
+                    writer.close();
+                }
+
+                if (!config.quiet) {
+                    EPGSource.Stats stats = gids.getStats();
+                    System.out.println("Number of programmes from cache: "
+                            + stats.cacheHits);
+                    System.out.println("Number of programmes fetched: "
+                            + stats.cacheMisses);
+                    System.out.println("Number of fetch errors: "
+                            + stats.fetchErrors);
+                }
+            } finally {
+                // Previously skipped whenever anything above threw.
+                gids.close();
+            }
+        } catch (Exception e) {
+            logger.error("Error in tvgids testing", e);
+        }
+    }
+
+}