Skip to content

Bridge request for Nikkei Asia #4153

Open
@ghost

Description

Bridge request

Bridge for Nikkei Asia.

General information

  • Host URI for the bridge (i.e. https://asia.nikkei.com):

  • Which information would you like to see?

Articles

  • How should the information be displayed/formatted?

  • Which of the following parameters do you expect?

    • Title
    • URI (link to the original article)
    • Author
    • Timestamp
    • Content (the content of the article)
    • Enclosures (pictures, videos, etc...)
    • Categories (categories, tags, etc...)

Options

  • Limit number of returned items
    • Default limit: 20
  • Load full articles
    • Cache articles (articles are stored in a local cache on first request): yes
    • Cache timeout (max = 24 hours): 1 hour
  • Balance requests (RSS-Bridge uses cached versions to reduce bandwidth usage)
    • Timeout (default = 5 minutes, max = 24 hours): 5 minutes

Additional notes

  • Fetch from rss feed
  • Fetch article full content
  • Remove ads
<?php

/**
 * Bridge that reads the Nikkei Asia RSS feed and attaches the full article
 * body (fetched per item and stripped of ad containers) to each entry.
 */
class NikkeiBridge extends BridgeAbstract
{
  const NAME = 'Nikkei Bridge';
  const URI = 'https://asia.nikkei.com';
  const DESCRIPTION = 'Fetches the latest articles from the Nikkei Asia';
  const MAINTAINER = 'notme';
  const CACHE_TIMEOUT = 3600;

  // Upper bound on the number of feed entries processed per run.
  const MAX_CONTENTS = 20;

  /**
   * Downloads the feed, then builds one item (title, uri, timestamp,
   * content) per entry, up to MAX_CONTENTS entries.
   *
   * Calls returnServerError() when the feed cannot be downloaded or parsed.
   */
  public function collectData()
  {
    $rssFeedUrl = 'https://asia.nikkei.com/rss/feed/nar';
    $rssContent = file_get_contents($rssFeedUrl);

    if (!$rssContent) {
      returnServerError('Could not request ' . $rssFeedUrl);
    }

    $rss = simplexml_load_string($rssContent);

    if (!$rss) {
      returnServerError('Could not parse RSS feed from ' . $rssFeedUrl);
    }

    // Items may sit directly under the root element or, in the RSS 2.0
    // layout, under <channel>; accept either so a format change on the
    // publisher side does not silently produce an empty feed.
    $entries = $rss->item;
    if (count($entries) === 0 && isset($rss->channel->item)) {
      $entries = $rss->channel->item;
    }

    $count = 0;
    foreach ($entries as $element) {
      if ($count >= self::MAX_CONTENTS) {
        break;
      }

      $count++;

      $item = [];
      $item['title'] = trim((string)$element->title);
      $item['uri'] = trim((string)$element->link);

      // strtotime() returns false for a missing or unparseable date; leave
      // the timestamp unset in that case instead of storing false.
      $timestamp = strtotime((string)$element->pubDate);
      if ($timestamp !== false) {
        $item['timestamp'] = $timestamp;
      }

      // Fetch the article content; a failure for one article must not
      // abort the whole feed, so fall back to a placeholder.
      $articleContent = $this->fetchArticleContent($item['uri']);
      if ($articleContent) {
        $item['content'] = $articleContent;
      } else {
        $item['content'] = 'Content could not be retrieved';
      }

      $this->items[] = $item;
    }
  }

  /**
   * Fetches the body of one article through the site's article_access
   * endpoint and returns it as an HTML fragment without ad containers.
   *
   * @param string $url Absolute URL of the article as given by the feed.
   * @return string|null Cleaned HTML fragment, or null when the URL has no
   *                     path, the request fails, or the response carries no
   *                     usable "body" field.
   */
  private function fetchArticleContent($url)
  {
    // Ask parse_url() for the path only: it yields null/false rather than
    // an undefined-index notice when the URL is malformed or has no path.
    $path = parse_url((string)$url, PHP_URL_PATH);
    if (!is_string($path) || $path === '') {
      error_log('Could not extract article path from ' . $url);
      return null;
    }

    // The endpoint is addressed by the base64-encoded article path.
    // NOTE(review): standard base64 may contain "/" and "+"; confirm the
    // endpoint accepts those unescaped inside the URL path.
    $encodedPath = base64_encode($path);
    $apiUrl = 'https://asia.nikkei.com/__service/v1/piano/article_access/' . $encodedPath;

    $apiResponse = file_get_contents($apiUrl);

    if (!$apiResponse) {
      error_log('Could not request ' . $apiUrl);
      return null;
    }

    $apiResponseData = json_decode($apiResponse, true);

    // json_decode() yields null on invalid JSON; also reject an empty or
    // non-string body, which DOMDocument::loadHTML() cannot take.
    if (
      !is_array($apiResponseData)
      || !isset($apiResponseData['body'])
      || !is_string($apiResponseData['body'])
      || $apiResponseData['body'] === ''
    ) {
      error_log('Invalid API response for ' . $apiUrl);
      return null;
    }

    $htmlContent = $apiResponseData['body'];

    // Silence libxml warnings about real-world HTML only while parsing,
    // then restore whatever setting the caller had.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // Without a charset hint loadHTML() does not decode the input as UTF-8
    // and non-ASCII characters come out mangled; the XML declaration
    // prefix forces UTF-8 decoding.
    $loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $htmlContent);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if (!$loaded) {
      error_log('Could not parse article HTML for ' . $apiUrl);
      return null;
    }

    // Remove elements whose class attribute contains "o-ads". Snapshot the
    // node list first so removals cannot disturb the iteration.
    $xpath = new DOMXPath($dom);
    $adNodes = iterator_to_array($xpath->query('//*[contains(@class, "o-ads")]'));
    foreach ($adNodes as $adsNode) {
      if ($adsNode->parentNode !== null) {
        $adsNode->parentNode->removeChild($adsNode);
      }
    }

    // Serialize only the children of <body>: saveHTML() without an
    // argument would emit DOCTYPE/html/body wrappers (and the encoding
    // hint added above), which do not belong inside a feed item.
    $body = $dom->getElementsByTagName('body')->item(0);
    if ($body === null) {
      return null;
    }

    $cleanedHtml = '';
    foreach ($body->childNodes as $child) {
      $cleanedHtml .= $dom->saveHTML($child);
    }

    return $cleanedHtml;
  }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions