diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/ListLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/ListLinkHandlerFactory.java index 9ea478b02c..7b04078c9b 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/ListLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/ListLinkHandlerFactory.java @@ -73,7 +73,7 @@ public ListLinkHandler fromQuery(String id, * however it should not be overridden by the actual implementation. * * @param id - * @return the url coresponding to id without any filters applied + * @return the url corresponding to id without any filters applied */ public String getUrl(String id) throws ParsingException { return getUrl(id, new ArrayList(0), ""); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/SearchQueryHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/SearchQueryHandlerFactory.java index 50977e20c1..d46670c9ba 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/SearchQueryHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/linkhandler/SearchQueryHandlerFactory.java @@ -12,7 +12,7 @@ public abstract class SearchQueryHandlerFactory extends ListLinkHandlerFactory { /////////////////////////////////// @Override - public abstract String getUrl(String querry, List contentFilter, String sortFilter) throws ParsingException; + public abstract String getUrl(String query, List contentFilter, String sortFilter) throws ParsingException; public String getSearchString(String url) { return ""; @@ -28,21 +28,21 @@ public String getId(String url) { } @Override - public SearchQueryHandler fromQuery(String querry, + public SearchQueryHandler fromQuery(String query, List contentFilter, String sortFilter) throws ParsingException { - return new SearchQueryHandler(super.fromQuery(querry, contentFilter, 
sortFilter)); + return new SearchQueryHandler(super.fromQuery(query, contentFilter, sortFilter)); } - public SearchQueryHandler fromQuery(String querry) throws ParsingException { - return fromQuery(querry, new ArrayList(0), ""); + public SearchQueryHandler fromQuery(String query) throws ParsingException { + return fromQuery(query, new ArrayList(0), ""); } /** * It's not mandatory for NewPipe to handle the Url * * @param url - * @return + * @return if we should accept the url */ @Override public boolean onAcceptUrl(String url) { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/localization/AbbreviationHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/localization/AbbreviationHelper.java new file mode 100644 index 0000000000..f0a7c9edcf --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/localization/AbbreviationHelper.java @@ -0,0 +1,157 @@ +package org.schabi.newpipe.extractor.localization; + +import java.util.HashMap; + + +/** + * Map matching abbreviations with their English equivalents + *

+ * Created by B0pol on 2020-02-16. + * + *

+ * By using this map, you can replace the abbreviations used for numbers in the 80 languages supported by YouTube + * with their English equivalents. + *

+ *

+ * Some languages use more abbreviations for numbers: east-asian languages have abbreviations for ten thousand + * and hundred million, indo-arabic languages have abbreviations for a hundred thousand and ten million. + * In those cases we replace ten thousand by {@link #tenThousandAbbreviation}, + * hundred thousand by {@link #hundredThousandAbbreviation}, + * ten million by {@link #tenMillionAbbreviation}, + * hundred million by {@link #hundredMillionAbbreviation}. + *

+ *

+ * The languages using each abbreviation are listed, by language code, in the comment to the right of its entry. + * + * @see Wikipedia page of language codes + *

+ */ +public class AbbreviationHelper { + + //should be safe until someone has 1 billion subscribers on YouTube + public static final HashMap abbreviationSubscribersCount = new HashMap<>(); + public static final String englishMillionAbbreviation = "M"; + public static final String englishThousandAbbreviation = "K"; + + public static final String tenThousandAbbreviation = "万"; + public static final String hundredThousandAbbreviation = "ল"; + public static final String tenMillionAbbreviation = "ক"; + public static final String hundredMillionAbbreviation = "億"; + + static { + abbreviationSubscribersCount.put(englishThousandAbbreviation, englishThousandAbbreviation); //az, iw, en, ro + abbreviationSubscribersCount.put(englishMillionAbbreviation, englishMillionAbbreviation); //iw, en, ca, es, eu, and many more + abbreviationSubscribersCount.put(tenMillionAbbreviation, tenMillionAbbreviation); + + abbreviationSubscribersCount.put("k", englishThousandAbbreviation); //af, no + abbreviationSubscribersCount.put("ሺ", englishThousandAbbreviation); //am + abbreviationSubscribersCount.put("ألف", englishThousandAbbreviation); //ar + abbreviationSubscribersCount.put("тыс", englishThousandAbbreviation); //be, ru + abbreviationSubscribersCount.put("хил", englishThousandAbbreviation); //bg + abbreviationSubscribersCount.put("হা", englishThousandAbbreviation); //bn + abbreviationSubscribersCount.put("hilj", englishThousandAbbreviation); //bs, sr + abbreviationSubscribersCount.put("tis", englishThousandAbbreviation); //cs, hr, sk, sl + abbreviationSubscribersCount.put("χιλ", englishThousandAbbreviation); //el + abbreviationSubscribersCount.put("tuh", englishThousandAbbreviation); //et + abbreviationSubscribersCount.put("هزار", englishThousandAbbreviation); //fa + abbreviationSubscribersCount.put("t", englishThousandAbbreviation); //fi + abbreviationSubscribersCount.put("હજાર", englishThousandAbbreviation); //gu + abbreviationSubscribersCount.put("हज़ार", englishThousandAbbreviation); 
//hi + abbreviationSubscribersCount.put("E", englishThousandAbbreviation); //hu + abbreviationSubscribersCount.put("հզր", englishThousandAbbreviation); //hy + abbreviationSubscribersCount.put("rb", englishThousandAbbreviation); //id + abbreviationSubscribersCount.put("þ", englishThousandAbbreviation); //is + abbreviationSubscribersCount.put("ათ", englishThousandAbbreviation); //ka + abbreviationSubscribersCount.put("мың", englishThousandAbbreviation); //kk + abbreviationSubscribersCount.put("м", englishThousandAbbreviation); //kk + abbreviationSubscribersCount.put("ពាន់", englishThousandAbbreviation); //km + abbreviationSubscribersCount.put("ಸಾ", englishThousandAbbreviation); //kn + abbreviationSubscribersCount.put("천", englishThousandAbbreviation); //ko + abbreviationSubscribersCount.put("миң", englishThousandAbbreviation); //ky + abbreviationSubscribersCount.put("ກີບ", englishThousandAbbreviation); //lo + abbreviationSubscribersCount.put("ພັນ", englishThousandAbbreviation); //lo + abbreviationSubscribersCount.put("tūkst", englishThousandAbbreviation); //lt, lv + abbreviationSubscribersCount.put("илј", englishThousandAbbreviation); //mk + abbreviationSubscribersCount.put("мянга", englishThousandAbbreviation); //mn + abbreviationSubscribersCount.put("ह", englishThousandAbbreviation); //mr + abbreviationSubscribersCount.put("ထောင်", englishThousandAbbreviation); //my + abbreviationSubscribersCount.put("हजार", englishThousandAbbreviation); //ne + abbreviationSubscribersCount.put("ਹਜ਼ਾਰ", englishThousandAbbreviation); //pa + abbreviationSubscribersCount.put("tys", englishThousandAbbreviation); //pl + abbreviationSubscribersCount.put("ද", englishThousandAbbreviation); //si + abbreviationSubscribersCount.put("mijë", englishThousandAbbreviation); //sq + abbreviationSubscribersCount.put("хиљ", englishThousandAbbreviation); //sr-Latn + abbreviationSubscribersCount.put("elfu", englishThousandAbbreviation); //sw + abbreviationSubscribersCount.put("ஆ", 
englishThousandAbbreviation); //ta + abbreviationSubscribersCount.put("వే", englishThousandAbbreviation); //te + abbreviationSubscribersCount.put("พัน", englishThousandAbbreviation); //th + abbreviationSubscribersCount.put("B", englishThousandAbbreviation); //tr + abbreviationSubscribersCount.put("тис", englishThousandAbbreviation); //uk + abbreviationSubscribersCount.put("ہزار", englishThousandAbbreviation); //ur + abbreviationSubscribersCount.put("ming", englishThousandAbbreviation); //uz + abbreviationSubscribersCount.put("N", englishThousandAbbreviation); //vi + + abbreviationSubscribersCount.put("m", englishMillionAbbreviation); //af, is + abbreviationSubscribersCount.put(" م", englishMillionAbbreviation); //an + abbreviationSubscribersCount.put("ሜ", englishMillionAbbreviation); //am + abbreviationSubscribersCount.put("ሜትር", englishMillionAbbreviation); //am + abbreviationSubscribersCount.put("مليون", englishMillionAbbreviation); //ar + abbreviationSubscribersCount.put("mln", englishMillionAbbreviation); //az, et, lt, nl, pl, sq, uz + abbreviationSubscribersCount.put("млн", englishMillionAbbreviation); //be, bg, kk, ky, ru, uk + abbreviationSubscribersCount.put("mil", englishMillionAbbreviation); //bs, cs, hr, ro, sk, sr-Latn + abbreviationSubscribersCount.put("mio", englishMillionAbbreviation); //da, sl + abbreviationSubscribersCount.put("Mio", englishMillionAbbreviation); //de + abbreviationSubscribersCount.put("εκ", englishMillionAbbreviation); //el + abbreviationSubscribersCount.put("میلیون", englishMillionAbbreviation); //fa + abbreviationSubscribersCount.put("م", englishMillionAbbreviation); //fa + abbreviationSubscribersCount.put("milj", englishMillionAbbreviation); //fi, lv + abbreviationSubscribersCount.put("մլն", englishMillionAbbreviation); //hy + abbreviationSubscribersCount.put("jt", englishMillionAbbreviation); //id + abbreviationSubscribersCount.put("Mln", englishMillionAbbreviation); //it + abbreviationSubscribersCount.put("მლნ", 
englishMillionAbbreviation); //ka + abbreviationSubscribersCount.put("លាន", englishMillionAbbreviation); //km + abbreviationSubscribersCount.put("ಮಿ", englishMillionAbbreviation); //kn + abbreviationSubscribersCount.put("ລ້ານ", englishMillionAbbreviation); //lo + abbreviationSubscribersCount.put("М", englishMillionAbbreviation); //mk + abbreviationSubscribersCount.put("мил", englishMillionAbbreviation); //mk, sr + abbreviationSubscribersCount.put("сая", englishMillionAbbreviation); //mn + abbreviationSubscribersCount.put("J", englishMillionAbbreviation); //ms + abbreviationSubscribersCount.put("သန်း", englishMillionAbbreviation); //my + abbreviationSubscribersCount.put("mill", englishMillionAbbreviation); //no + abbreviationSubscribersCount.put("mi", englishMillionAbbreviation); //pt + abbreviationSubscribersCount.put("මි", englishMillionAbbreviation); //si + abbreviationSubscribersCount.put("mn", englishMillionAbbreviation); //sv + abbreviationSubscribersCount.put("మి", englishMillionAbbreviation); //te + abbreviationSubscribersCount.put("மி", englishMillionAbbreviation); //ta + abbreviationSubscribersCount.put("ล้าน", englishMillionAbbreviation); //th + abbreviationSubscribersCount.put("Mn", englishMillionAbbreviation); //tr + abbreviationSubscribersCount.put("Tr", englishMillionAbbreviation); //vi + + abbreviationSubscribersCount.put("만", tenThousandAbbreviation); //ko + abbreviationSubscribersCount.put("万", tenThousandAbbreviation); //ja, zh-CN + abbreviationSubscribersCount.put("萬", tenThousandAbbreviation); //zh-TW + abbreviationSubscribersCount.put("သောင်း", tenThousandAbbreviation); //my + abbreviationSubscribersCount.put("หมื่น", tenThousandAbbreviation); //th + + abbreviationSubscribersCount.put("লা", hundredThousandAbbreviation); //bn + abbreviationSubscribersCount.put("લાખ", hundredThousandAbbreviation); //gu + abbreviationSubscribersCount.put("लाख", hundredThousandAbbreviation); //hi, mr, ne + abbreviationSubscribersCount.put("ਲੱਖ", 
hundredThousandAbbreviation); //pa + abbreviationSubscribersCount.put("لاکھ", hundredThousandAbbreviation); //ur + abbreviationSubscribersCount.put("သိန်း", hundredThousandAbbreviation); //my + abbreviationSubscribersCount.put("แสน", hundredThousandAbbreviation); //th + + abbreviationSubscribersCount.put("কো", tenMillionAbbreviation); //bn + abbreviationSubscribersCount.put("કરોડ", tenMillionAbbreviation); //gu + abbreviationSubscribersCount.put("क॰", tenMillionAbbreviation); //hi + abbreviationSubscribersCount.put("कोटी", tenMillionAbbreviation); //mr + abbreviationSubscribersCount.put("ကုဋေ", tenMillionAbbreviation); //my + abbreviationSubscribersCount.put("करोड", tenMillionAbbreviation); //ne + abbreviationSubscribersCount.put("ਕਰੋੜ", tenMillionAbbreviation); //pa + abbreviationSubscribersCount.put("کروڑ", tenMillionAbbreviation); //ur + + abbreviationSubscribersCount.put("億", hundredMillionAbbreviation); //ja, zh-TW + abbreviationSubscribersCount.put("억", hundredMillionAbbreviation); //ko + abbreviationSubscribersCount.put("亿", hundredMillionAbbreviation); //zh-CN + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java index eae3bcb9b4..d4c679693f 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeService.java @@ -152,14 +152,13 @@ public CommentsExtractor getCommentsExtractor(ListLinkHandler urlIdHandler) // https://www.youtube.com/picker_ajax?action_language_json=1 private static final List SUPPORTED_LANGUAGES = Localization.listFrom( - "en-GB" - /*"af", "am", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "da", "de", + "af", "am", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "da", "de", "el", "en", "en-GB", "es", "es-419", "es-US", "et", "eu", "fa", "fi", "fil", "fr", "fr-CA", 
"gl", "gu", "hi", "hr", "hu", "hy", "id", "is", "it", "iw", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "pa", "pl", "pt", "pt-PT", "ro", "ru", "si", "sk", "sl", "sq", "sr", "sr-Latn", "sv", "sw", "ta", "te", "th", "tr", - "uk", "ur", "uz", "vi", "zh-CN", "zh-HK", "zh-TW", "zu"*/ + "uk", "ur", "uz", "vi", "zh-CN", "zh-HK", "zh-TW", "zu" ); // https://www.youtube.com/picker_ajax?action_country_json=1 diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java index cc37cbdab7..dc9ccc5ee3 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java @@ -18,11 +18,12 @@ import org.schabi.newpipe.extractor.stream.StreamInfoItem; import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector; import org.schabi.newpipe.extractor.utils.Parser; -import org.schabi.newpipe.extractor.utils.Utils; import javax.annotation.Nonnull; import java.io.IOException; +import static org.schabi.newpipe.extractor.utils.Utils.mixedNumberWordToLong; + /* * Created by Christian Schabesberger on 25.07.16. * @@ -81,7 +82,8 @@ public String getUrl() throws ParsingException { public String getId() throws ParsingException { try { return doc.select("meta[itemprop=\"channelId\"]").first().attr("content"); - } catch (Exception ignored) {} + } catch (Exception ignored) { + } // fallback method; does not work with channels that have no "Subscribe" button (e.g. 
EminemVEVO) try { @@ -137,19 +139,18 @@ public String getFeedUrl() throws ParsingException { @Override public long getSubscriberCount() throws ParsingException { - + long subCount = -1; final Element el = doc.select("span[class*=\"yt-subscription-button-subscriber-count\"]").first(); if (el != null) { + // If the element is null, the channel have the subscriber count disabled String elTitle = el.attr("title"); try { - return Utils.mixedNumberWordToLong(elTitle); + subCount = mixedNumberWordToLong(elTitle, getExtractorLocalization()); } catch (NumberFormatException e) { throw new ParsingException("Could not get subscriber count", e); } - } else { - // If the element is null, the channel have the subscriber count disabled - return -1; } + return subCount; } @Override diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index 9568c7ff59..5bc29157dc 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -41,6 +41,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import static org.schabi.newpipe.extractor.utils.JsonUtils.getString; + /* * Created by Christian Schabesberger on 06.08.15. 
* @@ -107,8 +109,7 @@ public YoutubeStreamExtractor(StreamingService service, LinkHandler linkHandler) public String getName() throws ParsingException { assertPageFetched(); try { - return playerResponse.getObject("videoDetails").getString("title"); - + return getString(playerResponse.getObject("microformat").getObject("playerMicroformatRenderer").getObject("title"), "simpleText"); } catch (Exception e) { // fallback HTML method String name = null; @@ -185,12 +186,11 @@ public String getThumbnailUrl() throws ParsingException { public Description getDescription() throws ParsingException { assertPageFetched(); try { - // first try to get html-formatted description - return new Description(parseHtmlAndGetFullLinks(doc.select("p[id=\"eow-description\"]").first().html()), Description.HTML); + //JSON first because formatting is better, see https://github.com/TeamNewPipe/NewPipeExtractor/pull/257#discussion_r379828770 + return new Description(getString(playerResponse.getObject("microformat").getObject("playerMicroformatRenderer").getObject("description"), "simpleText"), Description.PLAIN_TEXT); } catch (Exception e) { try { - // fallback to raw non-html description - return new Description(playerResponse.getObject("videoDetails").getString("shortDescription"), Description.PLAIN_TEXT); + return new Description(parseHtmlAndGetFullLinks(doc.select("p[id=\"eow-description\"]").first().html()), Description.HTML); } catch (Exception ignored) { throw new ParsingException("Could not get the description", e); } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java index ebd0ba16a8..bf5ba34010 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java @@ -1,6 +1,7 @@ package org.schabi.newpipe.extractor.utils; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import 
org.schabi.newpipe.extractor.localization.Localization; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; @@ -8,8 +9,13 @@ import java.net.URLDecoder; import java.util.List; +import static org.schabi.newpipe.extractor.localization.AbbreviationHelper.abbreviationSubscribersCount; + public class Utils { + private static final String HTTP = "http://"; + private static final String HTTPS = "https://"; + private Utils() { //no instance } @@ -27,6 +33,24 @@ public static String removeNonDigitCharacters(String toRemove) { return toRemove.replaceAll("\\D+", ""); } + /** + *

Remove a number from a string.

+ *

Examples:

+ *
    + *
  • "123" -> ""
  • + *
  • "1.23K" -> "K"
  • + *
  • "1.23 M" -> " M"
  • + *
+ * Pay attention, it may remove the final dot. + * eg: "8,93 хил." -> " хил" + * + * @param toRemove string to remove a number + * @return a string that contains only not a number + */ + public static String removeNumber(String toRemove) { + return toRemove.replaceAll("[0-9,.]", ""); + } + /** *

Convert a mixed number word to a long.

*

Examples:

@@ -44,22 +68,84 @@ public static String removeNonDigitCharacters(String toRemove) { public static long mixedNumberWordToLong(String numberWord) throws NumberFormatException, ParsingException { String multiplier = ""; try { - multiplier = Parser.matchGroup("[\\d]+([\\.,][\\d]+)?([KMBkmb])+", numberWord, 2); - } catch(ParsingException ignored) {} + multiplier = Parser.matchGroup("[\\d]+([\\.,][\\d]+)?([KMBkmb万লক億])+", numberWord, 2); + } catch (ParsingException ignored) { + } double count = Double.parseDouble(Parser.matchGroup1("([\\d]+([\\.,][\\d]+)?)", numberWord) .replace(",", ".")); switch (multiplier.toUpperCase()) { case "K": - return (long) (count * 1e3); + return (long) (count * 1000); + case "万": //10K, used by east-asian languages + return (long) (count * 10_000); + case "ল": //100K, used by indo-arabic languages + return (long) (count * 100_000); case "M": - return (long) (count * 1e6); + return (long) (count * 1_000_000); + case "ক": //10M, used by indo-arabic languages + return (long) (count * 10_000_000); + case "億": //100M, used by east-asian languages + return (long) (count * 100_000_000); case "B": - return (long) (count * 1e9); + return (long) (count * 1_000_000_000); default: return (long) (count); } } + public static String removeWhiteSpaces(String s) { + return s.replaceAll("(\\s| | )", ""); + } + + /** + * Does the same as {@link #mixedNumberWordToLong(String)}, but for the 80 languages supported by YouTube. 
+ * + * @param numberWord string to be converted to a long + * @param loc a {@link Localization} + * @return a long + * @throws ParsingException + */ + public static long mixedNumberWordToLong(String numberWord, Localization loc) throws ParsingException { + numberWord = removeWhiteSpaces(numberWord); + String langCode = loc.getLanguageCode(); + String abbreviation = removeNumber(numberWord); + + //special case for portugal, "mil" is the abbreviation for thousand, but is Million for many other languages + if (langCode.equals("pt") && abbreviation.equals("mil")) { + numberWord = numberWord.replace("mil", "K"); + } else if (langCode.equals("ca") && abbreviation.equals("m")) { //same for catalan but for "m" + numberWord = numberWord.replace("m", "K"); + } + //special case for languages written right to left + else if (langCode.equals("sw") && abbreviation.equals("elfu")) { + numberWord = moveAtRight("elfu", numberWord); + } else if (langCode.equals("si")) { + numberWord = moveAtRight(abbreviation, numberWord); + } + + try { //special cases where it gives a number directly for some languages, or with a dot or a comma + String maybeAlreadyNumber = numberWord.replaceAll("([.,])", ""); + return Long.parseLong(maybeAlreadyNumber); + } catch (NumberFormatException e) { + //the number had an abbreviation, so it will be handled below + } + + if (!langCode.equals("en")) { + try { + numberWord = numberWord.replace(abbreviation, abbreviationSubscribersCount.get(abbreviation)); + } catch (NullPointerException e) { + throw new ParsingException("The abbreviation \"" + abbreviation + "\" is missing in AbbreviationHelper map"); + } + } + return mixedNumberWordToLong(numberWord); + } + + public static String moveAtRight(String toMove, String whole) { + whole = whole.replace(toMove, ""); + whole += toMove; + return whole; + } + /** * Check if the url matches the pattern. 
* @@ -83,9 +169,6 @@ public static void printErrors(List errors) { } } - private static final String HTTP = "http://"; - private static final String HTTPS = "https://"; - public static String replaceHttpWithHttps(final String url) { if (url == null) return null; diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeChannelExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeChannelExtractorTest.java index 317bd4fa49..fbb4647ab6 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeChannelExtractorTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeChannelExtractorTest.java @@ -7,6 +7,7 @@ import org.schabi.newpipe.extractor.ServiceList; import org.schabi.newpipe.extractor.channel.ChannelExtractor; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.localization.Localization; import org.schabi.newpipe.extractor.services.BaseChannelExtractorTest; import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeChannelExtractor; @@ -24,12 +25,17 @@ public static class Gronkh implements BaseChannelExtractorTest { @BeforeClass public static void setUp() throws Exception { - NewPipe.init(DownloaderTestImpl.getInstance()); + NewPipe.init(DownloaderTestImpl.getInstance(), new Localization("ru")); extractor = (YoutubeChannelExtractor) YouTube .getChannelExtractor("http://www.youtube.com/user/Gronkh"); extractor.fetchPage(); } + @Test + public void testGetSubscribersCount() throws Exception { + assertTrue(extractor.getSubscriberCount() >= 4880000); + } + /*////////////////////////////////////////////////////////////////////////// // Extractor //////////////////////////////////////////////////////////////////////////*/ diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeSubscriberTest.java 
b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeSubscriberTest.java new file mode 100644 index 0000000000..fb5b04a876 --- /dev/null +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeSubscriberTest.java @@ -0,0 +1,219 @@ +package org.schabi.newpipe.extractor.services.youtube; + +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; +import org.junit.Ignore; +import org.junit.Test; +import org.schabi.newpipe.DownloaderTestImpl; +import org.schabi.newpipe.extractor.NewPipe; +import org.schabi.newpipe.extractor.downloader.Downloader; +import org.schabi.newpipe.extractor.downloader.Response; +import org.schabi.newpipe.extractor.exceptions.ExtractionException; +import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; +import org.schabi.newpipe.extractor.localization.Localization; +import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeChannelExtractor; +import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.schabi.newpipe.extractor.ServiceList.YouTube; +import static org.schabi.newpipe.extractor.localization.AbbreviationHelper.abbreviationSubscribersCount; +import static org.schabi.newpipe.extractor.utils.Utils.removeNumber; +import static org.schabi.newpipe.extractor.utils.Utils.removeWhiteSpaces; + +/** + * A class that tests abbreviations and subscriber counts for all the languages YouTube supports. 
+ */ +@Ignore("Should be ran manually from time to time, as it's too time consuming.") +public class YoutubeSubscriberTest { + + private static final String url = "https://www.youtube.com/feed/guide_builder"; + private static final int PAUSE_DURATION_EXTRACTORS = 250; + private static final int PAUSE_DURATION_ABBREVIATIONS = 125; + + public static String getAbbreviation(String count) { + return removeNumber(removeWhiteSpaces(count)); + } + + public static void assertEqualsWithEnglish(String channelUrl) throws ExtractionException, IOException, InterruptedException { + NewPipe.init(DownloaderTestImpl.getInstance(), new Localization("en")); + YoutubeChannelExtractor extractorEnglish = (YoutubeChannelExtractor) YouTube + .getChannelExtractor(channelUrl); + extractorEnglish.fetchPage(); + long englishSubCount = extractorEnglish.getSubscriberCount(); + Localization localization; + for (int z = 0; z < YouTube.getSupportedLocalizations().size(); z++) { + localization = YouTube.getSupportedLocalizations().get(z); + System.out.println("Current localization: " + localization); + NewPipe.init(DownloaderTestImpl.getInstance(), localization); + YoutubeChannelExtractor extractor = (YoutubeChannelExtractor) YouTube + .getChannelExtractor(channelUrl); + extractor.fetchPage(); + + long subcriberCount = extractor.getSubscriberCount(); + if (subcriberCount == -1) { + System.err.println("Subscriber count for " + localization.toString() + " was -1;\n" + + "If the channel doesn't have the subscribers disabled, it was probably a failed request"); + } else { + assertEquals("Language that failed:" + localization.toString() + ".\nWe", englishSubCount, subcriberCount); + } + Thread.sleep(PAUSE_DURATION_EXTRACTORS); + } + } + + public static void assertEqualsWithEnglish(String channelUrl, Localization loc) throws ExtractionException, IOException { + //for only one language + NewPipe.init(DownloaderTestImpl.getInstance(), new Localization("en")); + YoutubeChannelExtractor extractorEnglish = 
(YoutubeChannelExtractor) YouTube + .getChannelExtractor(channelUrl); + extractorEnglish.fetchPage(); + long englishSubCount = extractorEnglish.getSubscriberCount(); + + NewPipe.init(DownloaderTestImpl.getInstance(), loc); + YoutubeChannelExtractor extractor = (YoutubeChannelExtractor) YouTube + .getChannelExtractor(channelUrl); + extractor.fetchPage(); + assertEquals(englishSubCount, extractor.getSubscriberCount()); + } + + public static void assertEqualsWithEnglish(String channelUrl, String languageCode) throws ExtractionException, IOException { + assertEqualsWithEnglish(channelUrl, new Localization(languageCode)); + + } + + public void runTest(Document doc, Localization localisation) throws ParsingException { + String currentSubscriberCountString; + String currentChannelName; + + Elements elements = doc.select(".yt-subscriber-count"); + for (int i = 0; i < elements.size(); i++) { + currentSubscriberCountString = doc.select(".yt-subscriber-count").get(i).attr("title"); + currentChannelName = doc.select(".yt-ui-ellipsis.yt-ui-ellipsis-2.yt-uix-sessionlink").get(i).attr("title"); + String abbreviation = getAbbreviation(currentSubscriberCountString); + try { + abbreviation = abbreviation.replace(abbreviation, abbreviationSubscribersCount.get(abbreviation)); + } catch (NullPointerException e) { + if (!abbreviation.isEmpty()) { + throw new ParsingException("This should be a real failed test. 
Abbreviation=\"" + abbreviation + "\"" + + "\nLocalization : " + localisation + + "\nOriginal string gathered from YouTube =\"" + currentSubscriberCountString + "\"" + + "\nTitle of the channel (probably wrong):" + currentChannelName); + } else { + //check whether this is one of the languages that give the number directly + try { //special case: some languages give the number directly, possibly with spaces, dots or commas in it + String maybeAlreadyNumber = currentSubscriberCountString.replaceAll("([ ., ])", ""); + long count = Long.parseLong(maybeAlreadyNumber); //the value is unused: this only checks that the string parses as a number + } catch (NumberFormatException x) { + System.err.println("The abbreviation is empty, this is probably a failed request" + + "Localization :" + localisation); + } + } + } + } + } + + /* + ======================== + TESTS FOR ABBREVIATIONS + ======================== + */ + + @Test + public void testOneLanguageAbbreviations() throws IOException, ReCaptchaException, ParsingException, InterruptedException { + Localization loc = new Localization("ms"); + //change the value of loc if you want to test a specific language. + + NewPipe.init(DownloaderTestImpl.getInstance(), loc); + Downloader dl = NewPipe.getDownloader(); + Response response = dl.get(url); + Document doc = YoutubeParsingHelper.parseAndCheckPage(url, response); + + /* + Uncomment the two lines below if you want to view the html file in your browser, then search it for the subscriber count given by the exception. + You'll get the real channel name (because the one given by the exception is often desynchronised). + Open the channel in your browser and you'll see how many subscribers it has; + then you know what the abbreviation given by the exception corresponds to (e.g. a million, a thousand, 10 thousand…). + Add it to the map in AbbreviationHelper.java. 
+ */ +// String pathToYTBTests = "src/test/java/org/schabi/newpipe/extractor/services/youtube/"; +// createFile(pathToYTBTests +"DELETEME_failTestYTBsubscriber" + loc.toString() + ".html", doc.toString()); + runTest(doc, loc); + } + + @Test + public void testAllLanguagesAbbreviations() throws IOException, ReCaptchaException, InterruptedException, ParsingException { + List docs = new ArrayList<>(); + int totalCount = 0; + Localization localization; + + for (int z = 0; z < YouTube.getSupportedLocalizations().size(); z++) { + localization = YouTube.getSupportedLocalizations().get(z); + System.out.println("Current localization: " + localization); + NewPipe.init(DownloaderTestImpl.getInstance(), localization); + Downloader dl = NewPipe.getDownloader(); + Response response = dl.get(url); + Document doc = YoutubeParsingHelper.parseAndCheckPage(url, response); + docs.add(doc); + runTest(doc, localization); + totalCount += doc.select(".yt-subscriber-count").size(); + Thread.sleep(PAUSE_DURATION_ABBREVIATIONS); //slowed down a bit to decrease the rate of reCAPTCHAs and false negatives + } + System.out.println("docs size: " + docs.size()); + System.out.println("total count (should be around 112*80=8960)" + totalCount); + } + + /* + ======================== + TESTS WITH THE EXTRACTOR + There are often spurious failures: the test with all languages often fails, but if you run the failed language on its own + it will be ok. Increase PAUSE_DURATION_EXTRACTORS to prevent them. 
+ ======================== + */ + + @Test + public void testDisabled() throws IOException, ExtractionException, InterruptedException { + //every language should give -1 + Localization localization; + for (int z = 0; z < YouTube.getSupportedLocalizations().size(); z++) { + localization = YouTube.getSupportedLocalizations().get(z); + System.out.println("Current localization: " + localization); + NewPipe.init(DownloaderTestImpl.getInstance(), localization); + YoutubeChannelExtractor extractor = (YoutubeChannelExtractor) YouTube + .getChannelExtractor("https://www.youtube.com/user/EminemVEVO/"); + extractor.fetchPage(); + + long subscriberCount = extractor.getSubscriberCount(); + assertEquals("Language that failed: " + localization.toString() + "\n We ", -1, subscriberCount); //NOTE(review): the message presumably ends with "We " because JUnit appends "expected:<-1> but was:<...>" to it - confirm + Thread.sleep(PAUSE_DURATION_EXTRACTORS); + } + } + + //don't use Invidious links: they take more time and make the tests fail more often + private static final String highestSubsUrl = "https://www.youtube.com/user/tseries"; + private static final String selenaGomezUrl = "https://www.youtube.com/channel/UCPNxhDvTcytIdvwXWAm43cA"; + private static final String franjoUrl = "https://www.youtube.com/channel/UC53gfTiWvslLPNuoDcoxmVg"; + + @Test + public void testOneLanguageExtractor() throws ExtractionException, IOException { + assertEqualsWithEnglish(franjoUrl, "ms"); + } + + @Test + public void testHighestSubsOnYoutube() throws ExtractionException, IOException, InterruptedException { + assertEqualsWithEnglish(highestSubsUrl); + } + + @Test + public void testSelenaGomez() throws InterruptedException, ExtractionException, IOException { + assertEqualsWithEnglish(selenaGomezUrl); + } + + @Test + public void testFranjo() throws InterruptedException, ExtractionException, IOException { + assertEqualsWithEnglish(franjoUrl); + } +} diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java 
b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java index 2798e8db5f..c31d9a06b5 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java @@ -9,6 +9,7 @@ import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.localization.Localization; import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeStreamExtractor; import org.schabi.newpipe.extractor.stream.*; import org.schabi.newpipe.extractor.utils.Utils; @@ -249,15 +250,28 @@ public void testGetDescription() throws ParsingException { @Test public void testGetFullLinksInDescription() throws ParsingException { - assertTrue(extractor.getDescription().getContent().contains("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); - assertTrue(extractor.getDescription().getContent().contains("https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); - assertTrue(extractor.getDescription().getContent().contains("https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); - assertTrue(extractor.getDescription().getContent().contains("https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + Description description = extractor.getDescription(); + String content = description.getContent(); + if (description.getType() == Description.HTML) { + //we should have full links + assertTrue(content.contains("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + 
assertTrue(content.contains("https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + assertTrue(content.contains("https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + assertTrue(content.contains("https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + } else { + //type == PLAIN_TEXT: we should have shortened (youtu.be) links + assertTrue(content.contains("https://youtu.be/X7FLCHVXpsA?list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + assertTrue(content.contains("https://youtu.be/Lqv6G0pDNnw?list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + assertTrue(content.contains("https://youtu.be/XxaRBPyrnBU?list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + assertTrue(content.contains("https://youtu.be/U-9tUEOFKNU?list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34")); + } - assertFalse(extractor.getDescription().getContent().contains("https://youtu.be/X7FLCHVXpsA?list=PL7...")); - assertFalse(extractor.getDescription().getContent().contains("https://youtu.be/Lqv6G0pDNnw?list=PL7...")); - assertFalse(extractor.getDescription().getContent().contains("https://youtu.be/XxaRBPyrnBU?list=PL7...")); - assertFalse(extractor.getDescription().getContent().contains("https://youtu.be/U-9tUEOFKNU?list=PL7...")); + //we should NEVER have the broken (truncated) shortened links that the HTML could give + // YoutubeStreamExtractor.parseHtmlAndGetFullLinks fixes those broken links, so we test that function here. 
+ assertFalse(content.contains("https://youtu.be/X7FLCHVXpsA?list=PL7...")); + assertFalse(content.contains("https://youtu.be/Lqv6G0pDNnw?list=PL7...")); + assertFalse(content.contains("https://youtu.be/XxaRBPyrnBU?list=PL7...")); + assertFalse(content.contains("https://youtu.be/U-9tUEOFKNU?list=PL7...")); } } @@ -308,4 +322,26 @@ public void testGetFrames() throws ExtractionException { } } } + + public static class LocalizationTest { + private static YoutubeStreamExtractor extractor; + + @BeforeClass + public static void setUp() throws Exception { + NewPipe.init(DownloaderTestImpl.getInstance(), new Localization("de")); + extractor = (YoutubeStreamExtractor) YouTube + .getStreamExtractor("https://www.youtube.com/watch?v=BWQ0BFVuSXA"); + extractor.fetchPage(); + } + + @Test + public void testGetName() throws ParsingException { + assertEquals("SKAM FRANCE EP.6 S5: Sonntag, 16:21 Uhr - Was jetzt?", extractor.getName()); + } + + @Test + public void testGetDescription() throws ParsingException { + assertTrue(extractor.getDescription().getContent().contains("Folgen Sie france.tv auf:")); + } + } } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/utils/UtilsTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/utils/UtilsTest.java index 5788674458..9a2264f941 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/utils/UtilsTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/utils/UtilsTest.java @@ -1,9 +1,18 @@ package org.schabi.newpipe.extractor.utils; +import com.grack.nanojson.JsonObject; import com.grack.nanojson.JsonParserException; +import com.grack.nanojson.JsonWriter; import org.junit.Test; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + import static org.junit.Assert.assertEquals; public class UtilsTest { @@ -15,4 +24,31 
@@ public void testMixedNumberWordToLong() throws JsonParserException, ParsingExcep
         assertEquals(10.5e6, Utils.mixedNumberWordToLong("10,5M"), 0.0);
         assertEquals(1.5e9, Utils.mixedNumberWordToLong("1,5B"), 0.0);
     }
-}
+
+    /** Writes content to the file at path, creating the missing parent directories first. */
+    public static void createFile(String path, String content) throws IOException {
+        //take the parent from the path itself: blanking the file name with String.replace
+        //also blanked every directory sharing that name (e.g. "out/out" became "/")
+        File parentDir = Paths.get(path).toAbsolutePath().toFile().getParentFile();
+        if (parentDir != null) { //null only when the path has no parent (a filesystem root)
+            Files.createDirectories(parentDir.toPath()); //no-op if they exist, throws if they can't be created
+        }
+        writeFile(path, content);
+    }
+
+    //lower level createFile. Doesn't create directories and takes only a String
+    public static void writeFile(String path, String content) throws IOException {
+        //try-with-resources closes (and thereby flushes) the writer even when write() throws
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
+            writer.write(content);
+        }
+    }
+
+    public static String jsonObjToString(JsonObject object) { //serialises the object with nanojson's JsonWriter
+        return JsonWriter.string(object);
+    }
+
+    public static void createFile(String path, JsonObject content) throws IOException { //JsonObject overload
+        createFile(path, jsonObjToString(content));
+    }
+}
\ No newline at end of file