Skip to content

Commit

Permalink
cleanup and subtitles if embedded
Browse files Browse the repository at this point in the history
  • Loading branch information
codingPF committed Feb 18, 2024
1 parent 7e6ef2c commit 6c1fe53
Show file tree
Hide file tree
Showing 12 changed files with 123 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import de.mediathekview.mlib.daten.Sender;
import de.mediathekview.mlib.messages.listener.MessageListener;
import de.mediathekview.mserver.base.config.MServerConfigManager;
import de.mediathekview.mserver.base.messages.ServerMessages;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.orfon.task.OrfOnAZTask;
import de.mediathekview.mserver.crawler.orfon.task.OrfOnEpisodeTask;
Expand Down Expand Up @@ -52,17 +53,23 @@ protected RecursiveTask<Set<Film>> createCrawlerTask() {
try {
// Sendungen Verpasst (letzten 14 Tage)
// TAG > Episode > Episode2Film
final Set<OrfOnVideoInfoDTO> epsiodesFromDay = processDayUrlsToCrawl();
allVideos.addAll(epsiodesFromDay);
//final Set<OrfOnVideoInfoDTO> epsiodesFromDay = processDayUrlsToCrawl();
//allVideos.addAll(epsiodesFromDay);
//printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size());
//getAndSetMaxCount(allVideos.size());
//
// Sendungen a-z
// Buchstabe > Episoden > Episode2Film
final Set<OrfOnVideoInfoDTO> videosFromTopics = processAZUrlsToCrawl();
allVideos.addAll(videosFromTopics);
printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size());
getAndSetMaxCount(allVideos.size());
//
// History (top categories) > children >
final Set<OrfOnVideoInfoDTO> historyVideos = processHistoryUrlToCrawl();
allVideos.addAll(historyVideos);
//final Set<OrfOnVideoInfoDTO> historyVideos = processHistoryUrlToCrawl();
//allVideos.addAll(historyVideos);
//printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size());
//getAndSetMaxCount(allVideos.size());
//
return new OrfOnVideoInfo2FilmTask(this, new ConcurrentLinkedQueue<>(allVideos));
} catch (final Exception ex) {
Expand Down Expand Up @@ -113,15 +120,21 @@ private Queue<OrfOnBreadCrumsUrlDTO> createAZUrlsToCrawl() {
/**
 * Runs the history crawl as a chain of four tasks submitted one after another to
 * {@code forkJoinPool}. Each stage blocks on {@code get()} and its result set is wrapped in a
 * new {@link ConcurrentLinkedQueue} that becomes the input of the next stage:
 * OrfOnHistoryTask, then OrfOnHistoryChildrenTask, then OrfOnHistoryVideoItemTask, then
 * OrfOnEpisodeTask. The size of every intermediate result is logged at debug level.
 *
 * @return the set of video infos produced by the final OrfOnEpisodeTask
 * @throws InterruptedException if the calling thread is interrupted while waiting on a stage
 * @throws ExecutionException if one of the submitted tasks fails with an exception
 */
private Set<OrfOnVideoInfoDTO> processHistoryUrlToCrawl() throws InterruptedException, ExecutionException {
// Stage 1: start from the URLs supplied by createHistoryUrlToCrawl().
final ForkJoinTask<Set<OrfOnBreadCrumsUrlDTO>> histroyTask = forkJoinPool.submit(new OrfOnHistoryTask(this, createHistoryUrlToCrawl()));
final Set<OrfOnBreadCrumsUrlDTO> historyChidrenUrls = histroyTask.get();
LOG.debug("Found {} entries in OrfOnHistoryTask ", historyChidrenUrls.size());
// Stage 2: feed the stage-1 result into OrfOnHistoryChildrenTask.
final ForkJoinTask<Set<OrfOnBreadCrumsUrlDTO>> historyChildrenTask = forkJoinPool.submit(new OrfOnHistoryChildrenTask(this, new ConcurrentLinkedQueue<>(historyChidrenUrls)));
final Set<OrfOnBreadCrumsUrlDTO> historyItemUrls = historyChildrenTask.get();
LOG.debug("Found {} entries in OrfOnHistoryChildrenTask ", historyItemUrls.size());
// Stage 3: feed the stage-2 result into OrfOnHistoryVideoItemTask.
final ForkJoinTask<Set<OrfOnBreadCrumsUrlDTO>> historyItemTask = forkJoinPool.submit(new OrfOnHistoryVideoItemTask(this, new ConcurrentLinkedQueue<>(historyItemUrls)));
final Set<OrfOnBreadCrumsUrlDTO> historyEpisodesUrls = historyItemTask.get();
LOG.debug("Found {} entries in OrfOnHistoryVideoItemTask ", historyEpisodesUrls.size());
// Stage 4: turn the stage-3 URLs into video infos via OrfOnEpisodeTask.
final ForkJoinTask<Set<OrfOnVideoInfoDTO>> historyEpisodeTask = forkJoinPool.submit(new OrfOnEpisodeTask(this, new ConcurrentLinkedQueue<>(historyEpisodesUrls)));
// NOTE(review): this text comes from a rendered commit diff; the return directly below appears
// to be the removed line that the three statements after it replace - confirm against the file.
return historyEpisodeTask.get();
final Set<OrfOnVideoInfoDTO> historyEpisodeVideos = historyEpisodeTask.get();
LOG.debug("Found {} entries in OrfOnEpisodeTask ", historyEpisodeVideos.size());
//
return historyEpisodeVideos;
}

private Queue<OrfOnBreadCrumsUrlDTO> createHistoryUrlToCrawl() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.Collection;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import de.mediathekview.mlib.daten.FilmUrl;
import de.mediathekview.mlib.daten.GeoLocations;
Expand All @@ -18,40 +19,46 @@ public class OrfOnVideoInfoDTO {
private Optional<String> title;
private Optional<String> titleWithDate;
private Optional<String> topic;
private Optional<String> topicForArchive;
private Optional<LocalDateTime> aired;
private Optional<Duration> duration;
private Optional<String> description;
private Optional<URL> website;
private Optional<Collection<GeoLocations>> georestriction;
private Optional<URL> subtitleSource;
private Optional<Map<Resolution, FilmUrl>> videoUrls;
private Optional<Set<URL>> subtitleUrls;

/**
 * Creates a video info DTO from already-parsed values. Every argument is assigned unchanged to
 * the field of the same name; nothing is copied or validated here.
 *
 * @param topicForArchive additional topic value stored next to {@code topic}
 * @param subtitleSource single subtitle URL, kept separately from {@code subtitleUrls}
 * @param videoUrls video URLs keyed by {@code Resolution}
 * @param subtitleUrls set of subtitle URLs
 */
public OrfOnVideoInfoDTO(
Optional<String> id,
Optional<String> channel,
Optional<String> title,
Optional<String> titleWithDate,
Optional<String> topic,
Optional<String> topicForArchive,
Optional<LocalDateTime> aired,
Optional<Duration> duration,
Optional<String> description,
Optional<URL> website,
Optional<Collection<GeoLocations>> georestriction,
Optional<URL> subtitleSource,
// NOTE(review): this text comes from a rendered commit diff; the next line appears to be the
// removed end of the old parameter list, superseded by the two lines after it - confirm.
Optional<Map<Resolution, FilmUrl>> videoUrls) {
Optional<Map<Resolution, FilmUrl>> videoUrls,
Optional<Set<URL>> subtitleUrls) {
super();
this.id = id;
this.channel = channel;
this.title = title;
this.titleWithDate = titleWithDate;
this.topic = topic;
this.topicForArchive = topicForArchive;
this.aired = aired;
this.duration = duration;
this.description = description;
this.website = website;
this.georestriction = georestriction;
this.subtitleSource = subtitleSource;
this.videoUrls = videoUrls;
this.subtitleUrls = subtitleUrls;
}

public Optional<String> getId() {
Expand All @@ -69,8 +76,8 @@ public Optional<String> getTitleWithDate() {
/**
 * Returns the topic value held by this DTO.
 *
 * @return the stored topic, exactly as kept in the {@code topic} field
 */
public Optional<String> getTopic() {
    return this.topic;
}
public void setTopic(Optional<String> newTopic) {
topic = newTopic;
public Optional<String> getTopicForArchive() {
return topicForArchive;
}
public Optional<LocalDateTime> getAired() {
return aired;
Expand All @@ -93,6 +100,9 @@ public Optional<URL> getSubtitleSource() {
/**
 * Returns the video URLs held by this DTO, keyed by resolution.
 *
 * @return the stored map, exactly as kept in the {@code videoUrls} field (no copy is made)
 */
public Optional<Map<Resolution, FilmUrl>> getVideoUrls() {
    return this.videoUrls;
}
/**
 * Returns the subtitle URLs held by this DTO.
 *
 * @return the stored set, exactly as kept in the {@code subtitleUrls} field (no copy is made)
 */
public Optional<Set<URL>> getSubtitleUrls() {
    return this.subtitleUrls;
}

@Override
public int hashCode() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.crawler.basic.PagedElementListDTO;
import de.mediathekview.mserver.crawler.orfon.OrfOnBreadCrumsUrlDTO;
import de.mediathekview.mserver.crawler.orfon.OrfOnConstants;

import java.lang.reflect.Type;
import java.util.Optional;
Expand All @@ -29,14 +30,13 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> deserialize(
PagedElementListDTO<OrfOnBreadCrumsUrlDTO> page = new PagedElementListDTO<>();
page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE));
//
if (jsonPage.has(TAG_ITEMS[0]) && jsonPage.get(TAG_ITEMS[0]).isJsonObject() &&
jsonPage.get(TAG_ITEMS[0]).getAsJsonObject().has(TAG_ITEMS[1]) &&
jsonPage.get(TAG_ITEMS[0]).getAsJsonObject().get(TAG_ITEMS[1]).isJsonArray()) {
for (JsonElement topic : jsonPage.get(TAG_ITEMS[0]).getAsJsonObject().get(TAG_ITEMS[1]).getAsJsonArray()) {
Optional<String> id = JsonUtils.getElementValueAsString(topic, TAG_ITEM_ID);
Optional<String> url = JsonUtils.getElementValueAsString(topic, TAG_ITEM_EPISODES);
final Optional<JsonElement> items = JsonUtils.getElement(jsonPage, TAG_ITEMS);
if (items.isPresent() && items.get().isJsonArray()) {
for (JsonElement topic : items.get().getAsJsonArray()) {
final Optional<String> id = JsonUtils.getElementValueAsString(topic, TAG_ITEM_ID);
final Optional<String> url = JsonUtils.getElementValueAsString(topic, TAG_ITEM_EPISODES);
if (id.isPresent() && url.isPresent()) {
page.addElement(new OrfOnBreadCrumsUrlDTO(id.get(), url.get()));
page.addElement(new OrfOnBreadCrumsUrlDTO(id.get(), OrfOnConstants.createMaxLimmitUrl(url.get())));
} else {
LOG.debug("No episodes found in item {}", id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,18 @@
import de.mediathekview.mserver.crawler.orfon.OrfOnVideoInfoDTO;

import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -30,6 +33,7 @@ public class OrfOnEpisodeDeserializer implements JsonDeserializer<OrfOnVideoInfo
private static final String TAG_TITLE = "title";
private static final String TAG_TITLE_WITH_DATE = "share_subject";
private static final String TAG_TOPIC = "profile_title";
private static final String TAG_TOPIC_ARCHIVE = "sub_headline";
private static final String TAG_AIRED = "date";
private static final String TAG_DURATION = "duration_seconds";
private static final String TAG_DESCRIPTION = "description";
Expand All @@ -41,26 +45,33 @@ public class OrfOnEpisodeDeserializer implements JsonDeserializer<OrfOnVideoInfo
private static final String TAG_VIDEO = "sources";
private static final String TAG_VIDEO_QUALITY = "quality_key";
private static final String TAG_VIDEO_URL = "src";

//
private static final String[] TAG_SUBTITLE_SMI = {"_embedded", "subtitle", "sami_url"};
private static final String[] TAG_SUBTITLE_SRT = {"_embedded", "subtitle", "srt_url"};
private static final String[] TAG_SUBTITLE_TTML = {"_embedded", "subtitle", "ttml_url"};
private static final String[] TAG_SUBTITLE_VTT = {"_embedded", "subtitle", "vtt_url"};
private static final String[] TAG_SUBTITLE_XML = {"_embedded", "subtitle", "xml_url"};

@Override
public OrfOnVideoInfoDTO deserialize(
final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context)
throws JsonParseException {

OrfOnVideoInfoDTO aFilm = new OrfOnVideoInfoDTO(
JsonUtils.getElementValueAsString(jsonElement, TAG_ID),
JsonUtils.getElementValueAsString(jsonElement, TAG_CHANNEL),
JsonUtils.getElementValueAsString(jsonElement, TAG_TITLE),
JsonUtils.getElementValueAsString(jsonElement, TAG_TITLE_WITH_DATE),
JsonUtils.getElementValueAsString(jsonElement, TAG_TOPIC),
JsonUtils.getElementValueAsString(jsonElement, TAG_TOPIC_ARCHIVE),
parseAiredDate(JsonUtils.getElementValueAsString(jsonElement, TAG_AIRED)),
parseDuration(JsonUtils.getElementValueAsString(jsonElement, TAG_DURATION)),
JsonUtils.getElementValueAsString(jsonElement, TAG_DESCRIPTION),
parseWebsite(JsonUtils.getElementValueAsString(jsonElement, TAG_SHARE_BODY)),
parseGeoLocations(JsonUtils.getElementValueAsString(jsonElement, TAG_RIGHT)),
parseSubtitle(JsonUtils.getElementValueAsString(jsonElement, TAG_SUBTITLE)),
parseUrl(jsonElement)
parseSubtitleSource(JsonUtils.getElementValueAsString(jsonElement, TAG_SUBTITLE)),
parseUrl(jsonElement),
parseSubtitleUrls(jsonElement)

);
//LOG.debug("{}",jsonElement );

Expand All @@ -79,6 +90,10 @@ public OrfOnVideoInfoDTO deserialize(
LOG.debug("{}",jsonElement );
LOG.debug("############");
}
if (aFilm.getSubtitleSource().isPresent() && aFilm.getSubtitleUrls().isEmpty()) {
LOG.debug("getSubtitleSource but no getSubtitleUrls {}", aFilm.getId().get());
}

// "genre_title": "Wetter",
// "headline": "Wetter Tirol vom 05.01.2024",
// "profile_title": "Wetter Tirol",
Expand All @@ -90,7 +105,7 @@ public OrfOnVideoInfoDTO deserialize(
return aFilm;
}

private Optional<URL> parseSubtitle(Optional<String> text) {
private Optional<URL> parseSubtitleSource(Optional<String> text) {
Optional<URL> sub = Optional.empty();
if (text.isPresent()) {
try {
Expand All @@ -103,6 +118,29 @@ private Optional<URL> parseSubtitle(Optional<String> text) {

}


/**
 * Collects the subtitle URLs found under the five known subtitle tag paths (SMI, SRT, TTML, VTT
 * and XML) of the given JSON element. Values that are missing, or that {@code toURL} cannot
 * convert, are skipped.
 *
 * <p>NOTE(review): {@link URL#hashCode()} and {@link URL#equals(Object)} may resolve host
 * names, so adding to a {@link HashSet} can block on DNS - confirm this is acceptable.
 *
 * @param element JSON element that is searched for the subtitle tag paths
 * @return the collected URLs, or an empty Optional when none were found
 */
private Optional<Set<URL>> parseSubtitleUrls(JsonElement element) {
    final String[][] subtitleTagPaths = {
        TAG_SUBTITLE_SMI,
        TAG_SUBTITLE_SRT,
        TAG_SUBTITLE_TTML,
        TAG_SUBTITLE_VTT,
        TAG_SUBTITLE_XML
    };
    final Set<URL> collected = new HashSet<>();
    for (final String[] tagPath : subtitleTagPaths) {
        final Optional<String> rawUrl = JsonUtils.getElementValueAsString(element, tagPath);
        if (rawUrl.isPresent()) {
            toURL(rawUrl.get()).ifPresent(collected::add);
        }
    }
    if (!collected.isEmpty()) {
        return Optional.of(collected);
    }
    return Optional.empty();
}

/**
 * Converts a string into a {@link URL} without propagating the parse failure.
 *
 * @param aString text to convert
 * @return the parsed URL, or an empty Optional (after a debug log entry) when the text is not a
 *     well-formed URL
 */
private Optional<URL> toURL(String aString) {
    Optional<URL> converted;
    try {
        converted = Optional.of(new URL(aString));
    } catch (MalformedURLException e) {
        // Best effort: a broken subtitle link must not abort parsing of the whole episode.
        LOG.debug("error converting {} to URL {}", aString, e);
        converted = Optional.empty();
    }
    return converted;
}

private Optional<Map<Resolution, FilmUrl>> parseUrl(JsonElement jsonElement) {

for (Map.Entry<String, JsonElement> entry : jsonElement.getAsJsonObject().getAsJsonObject(TAG_VIDEO).entrySet()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,9 @@
import de.mediathekview.mserver.crawler.orfon.OrfOnVideoInfoDTO;

import java.lang.reflect.Type;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Optional;

public class OrfOnEpisodesDeserializer implements JsonDeserializer<PagedElementListDTO<OrfOnVideoInfoDTO>> {
private static final Logger LOG = LogManager.getLogger(OrfOnEpisodesDeserializer.class);
private static final String[] TAG_NEXT_PAGE = {"_links", "next", "href"};
private static final String[] TAG_ITEMS = {"_embedded", "items"};
private static final OrfOnEpisodeDeserializer itemDeserializer = new OrfOnEpisodeDeserializer();
Expand All @@ -27,14 +23,12 @@ public PagedElementListDTO<OrfOnVideoInfoDTO> deserialize(
PagedElementListDTO<OrfOnVideoInfoDTO> page = new PagedElementListDTO<OrfOnVideoInfoDTO>();
page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE));
//
//OrfOnEpisodeDeserializer itemDeserializer = new OrfOnEpisodeDeserializer();
if (jsonPage.has(TAG_ITEMS[0]) && jsonPage.get(TAG_ITEMS[0]).isJsonObject() &&
jsonPage.get(TAG_ITEMS[0]).getAsJsonObject().has(TAG_ITEMS[1]) &&
jsonPage.get(TAG_ITEMS[0]).getAsJsonObject().get(TAG_ITEMS[1]).isJsonArray()) {
for (JsonElement item : jsonPage.get(TAG_ITEMS[0]).getAsJsonObject().get(TAG_ITEMS[1]).getAsJsonArray()) {
page.addElement(itemDeserializer.deserialize(item, null, null));
}
}
final Optional<JsonElement> items = JsonUtils.getElement(jsonPage, TAG_ITEMS);
if (items.isPresent() && items.get().isJsonArray()) {
for (JsonElement item : items.get().getAsJsonArray()) {
page.addElement(itemDeserializer.deserialize(item, null, null));
}
}
return page;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ public PagedElementListDTO<CrawlerUrlDTO> deserialize(
PagedElementListDTO<CrawlerUrlDTO> page = new PagedElementListDTO<>();
page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE));
//
Optional<JsonElement> itemArray = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY);
final Optional<JsonElement> itemArray = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY);
if (itemArray.isPresent() && itemArray.get().isJsonArray()) {
for (JsonElement item : itemArray.get().getAsJsonArray()) {
Optional<String> url = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL);
final Optional<String> url = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL);
if (url.isPresent()) {
page.addElement(new CrawlerUrlDTO(JsonUtils.getElementValueAsString(item, TAG_TARGET_URL).get()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> deserialize(
PagedElementListDTO<OrfOnBreadCrumsUrlDTO> page = new PagedElementListDTO<>();
page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE));
//
Optional<JsonElement> itemArrayTop = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY);
if (itemArrayTop.isPresent() && itemArrayTop.get().isJsonArray()) {
for (JsonElement item : itemArrayTop.get().getAsJsonArray()) {
Optional<String> videoItemUrl = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL);
Optional<String> childrenUrl = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL2);
Optional<String> title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE);
Optional<JsonElement> itemArray = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY);
if (itemArray.isPresent() && itemArray.get().isJsonArray()) {
for (JsonElement item : itemArray.get().getAsJsonArray()) {
final Optional<String> videoItemUrl = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL);
final Optional<String> childrenUrl = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL2);
final Optional<String> title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE);
if (videoItemUrl.isPresent()) {
page.addElement(new OrfOnBreadCrumsUrlDTO(
title.orElse("MISSING TITLE"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> deserialize(
PagedElementListDTO<OrfOnBreadCrumsUrlDTO> page = new PagedElementListDTO<>();
page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE));
//
Optional<JsonElement> itemArrayTop = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY_TOP);
final Optional<JsonElement> itemArrayTop = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY_TOP);
if (itemArrayTop.isPresent() && itemArrayTop.get().isJsonArray()) {
page.addElements(parseSection(itemArrayTop.get().getAsJsonArray()).getElements());
}
//
Optional<JsonElement> itemArrayButtom = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY_BUTTOM);
final Optional<JsonElement> itemArrayButtom = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY_BUTTOM);
if (itemArrayButtom.isPresent() && itemArrayButtom.get().isJsonArray()) {
page.addElements(parseSection(itemArrayButtom.get().getAsJsonArray()).getElements());
}
Expand All @@ -45,8 +45,8 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> deserialize(
public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> parseSection(JsonArray itemArray) {
PagedElementListDTO<OrfOnBreadCrumsUrlDTO> items = new PagedElementListDTO<>();
for (JsonElement item : itemArray) {
Optional<String> url = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL);
Optional<String> title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE);
final Optional<String> url = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL);
final Optional<String> title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE);
if (url.isPresent()) {
items.addElement(new OrfOnBreadCrumsUrlDTO(
title.orElse("EMPTY"),
Expand Down
Loading

0 comments on commit 6c1fe53

Please sign in to comment.