diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fd2f3d2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +shopify_app_store/__pycache__ +shopify_app_store/spiders/__pycache__ diff --git a/requirements.txt b/requirements.txt index 35e58b4..8a6c2f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -Scrapy==2.11.2 +Scrapy==2.14.1 beautifulsoup4==4.12.3 pandas==2.2.2 +rich>=14.0.0 diff --git a/shopify_app_store/pipelines.py b/shopify_app_store/pipelines.py index 10a9c94..32dd1bf 100644 --- a/shopify_app_store/pipelines.py +++ b/shopify_app_store/pipelines.py @@ -24,25 +24,18 @@ def open_spider(self, spider): def process_item(self, item, spider): if isinstance(item, App): self.store_app(item) - return None - if isinstance(item, PricingPlan): + elif isinstance(item, PricingPlan): self.store_pricing_plan(item) - return None - if isinstance(item, PricingPlanFeature): + elif isinstance(item, PricingPlanFeature): self.store_pricing_plan_feature(item) - return None - if isinstance(item, Category): + elif isinstance(item, Category): self.store_category(item) - return None - if isinstance(item, AppCategory): + elif isinstance(item, AppCategory): self.store_app_category(item) - return None - if isinstance(item, KeyBenefit): + elif isinstance(item, KeyBenefit): self.store_key_benefit(item) - return None - if isinstance(item, AppReview): + elif isinstance(item, AppReview): self.store_app_review(item) - return None return item diff --git a/shopify_app_store/rich_ui.py b/shopify_app_store/rich_ui.py new file mode 100644 index 0000000..1abba44 --- /dev/null +++ b/shopify_app_store/rich_ui.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +"""Rich interactive terminal dashboard for Scrapy spider.""" + +import time +from collections import deque +from datetime import timedelta + +from rich.console import Console +from rich.layout import Layout +from rich.live import Live +from rich.panel import Panel +from rich.progress import Progress, BarColumn, 
class RichDashboard:
    """Scrapy extension that renders a live Rich terminal dashboard.

    The extension subscribes to crawler signals (spider opened/closed, item
    scraped, spider error, response received), keeps running counters, and
    redraws a ``rich.live.Live`` panel whenever a counter changes.  On
    ``spider_opened`` it attaches itself to the spider as ``spider._rich_ui``
    so the spider can report app-level progress through ``notify_scraped``,
    ``notify_skipped`` and ``set_total_apps``.

    Free-form text (exception messages, URL segments, the close reason) is
    stored raw and escaped only when rendered, so brackets inside it can
    never be parsed as Rich markup.
    """

    # Width, in characters, of the textual progress bar.
    BAR_WIDTH = 30

    # Emoji shown next to each item type in the "Item Counts" table.
    ITEM_EMOJI = {
        'App': '📱',
        'KeyBenefit': '✨',
        'PricingPlan': '💰',
        'PricingPlanFeature': '📋',
        'Category': '🏷️',
        'AppCategory': '🔗',
        'AppReview': '⭐',
    }

    # Activity entry type -> (marker markup, style applied to the detail text).
    ACTIVITY_STYLES = {
        'success': ('[green]✓[/]', 'dim'),
        'error': ('[red]✗[/]', 'red'),
        'rate_limit': ('[yellow]⚠[/]', 'yellow'),
        'skip': ('[dim]⏭[/]', 'dim'),
    }

    def __init__(self, crawler):
        """Initialise all counters to zero; the display starts in ``spider_opened``.

        Args:
            crawler: The Scrapy crawler this extension is bound to.
        """
        self.crawler = crawler
        self.console = Console()

        # App-level counters (driven by the spider through notify_* calls).
        self.total_apps = 0
        self.scraped_apps = 0
        self.skipped_apps = 0

        # Crawl health counters.
        self.error_count = 0
        self.rate_limit_count = 0
        # Displayed in the stats row; no method of this class increments it.
        self.retry_count = 0

        # Number of scraped items per item class name.
        self.item_counts = {
            'App': 0,
            'KeyBenefit': 0,
            'PricingPlan': 0,
            'PricingPlanFeature': 0,
            'Category': 0,
            'AppCategory': 0,
            'AppReview': 0,
        }

        # Most recent activity entries as (entry_type, name, detail) tuples.
        self.activity_log = deque(maxlen=8)

        # Monotonic timestamp of spider start; None until spider_opened runs.
        self.start_time = None

        # Live display handle; None until spider_opened runs.
        self.live = None
        # Kept for backward compatibility; not used by any method here.
        self.progress = None
        self.app_task_id = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and connect its handlers to crawler signals."""
        ext = cls(crawler)
        handlers = (
            (ext.spider_opened, signals.spider_opened),
            (ext.spider_closed, signals.spider_closed),
            (ext.item_scraped, signals.item_scraped),
            (ext.spider_error, signals.spider_error),
            (ext.response_received, signals.response_received),
        )
        for handler, signal in handlers:
            crawler.signals.connect(handler, signal=signal)
        return ext

    # ------------------------------------------------------------------
    # Signal handlers
    # ------------------------------------------------------------------

    def spider_opened(self, spider):
        """Start the clock, expose the dashboard to the spider, start the display."""
        # Monotonic clock: durations must not jump if the wall clock is adjusted.
        self.start_time = time.monotonic()
        spider._rich_ui = self
        self.live = Live(self._build_layout(), console=self.console, refresh_per_second=2)
        self.live.start()

    def spider_closed(self, spider, reason):
        """Stop the live display and print the final summary panel."""
        if self.live:
            self.live.stop()

        self.console.print()
        self.console.print(Panel(
            self._build_summary(self._elapsed(), reason),
            title="[bold cyan]🛍️ Scraping Complete[/]",
            border_style="cyan",
        ))

    def item_scraped(self, item, response, spider):
        """Count the item under its class name when that name is tracked."""
        class_name = type(item).__name__
        if class_name in self.item_counts:
            self.item_counts[class_name] += 1
        self._refresh()

    def spider_error(self, failure, response, spider):
        """Record a callback error and log its (truncated) message."""
        self.error_count += 1
        app_name = self._app_name_from_url(response.url) if response is not None else 'unknown'
        error_msg = str(failure.value)[:60] if failure is not None else 'Unknown error'
        self.activity_log.append(('error', app_name, error_msg))
        self._refresh()

    def response_received(self, response, request, spider):
        """Count and log HTTP 429 responses; every other status is ignored."""
        if response.status != 429:
            return
        self.rate_limit_count += 1
        app_name = self._app_name_from_url(request.url)
        self.activity_log.append(('rate_limit', app_name, '429 Rate Limited'))
        self._refresh()

    # ------------------------------------------------------------------
    # API called by the spider
    # ------------------------------------------------------------------

    def notify_scraped(self, app_url):
        """Called by spider when an app page is successfully parsed."""
        self.scraped_apps += 1
        app_name = self._app_name_from_url(app_url)
        self.activity_log.append(('success', app_name, f'{self.item_counts["App"]} apps total'))
        self._refresh()

    def notify_skipped(self):
        """Called by spider when an app is skipped (unchanged since last scrape)."""
        self.skipped_apps += 1
        self._refresh()

    def set_total_apps(self, count):
        """Called by spider after parsing sitemap to set total app count."""
        self.total_apps = count
        self._refresh()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _app_name_from_url(url):
        """Return the last path segment of *url*, ignoring a trailing slash."""
        return url.rstrip('/').rsplit('/', 1)[-1]

    @staticmethod
    def _escape(text):
        """Return *text* escaped so Rich renders it literally, not as markup."""
        # Local import: only needed at render time.
        from rich.markup import escape
        return escape(str(text))

    def _elapsed(self):
        """Return seconds since the spider opened, or 0 if it never opened."""
        if self.start_time is None:
            return 0
        return time.monotonic() - self.start_time

    def _apps_per_minute(self, elapsed):
        """Return scraped apps per minute for *elapsed* seconds (0 if no time passed)."""
        return (self.scraped_apps / elapsed * 60) if elapsed > 0 else 0

    def _refresh(self):
        """Hand a freshly built layout to the live display, if it is running."""
        if self.live:
            self.live.update(self._build_layout())

    def _progress_markup(self):
        """Return the markup line for the app progress bar.

        The percentage is clamped to [0, 100]: ``scraped + skipped`` can
        exceed ``total_apps`` (the counters are updated independently of the
        total), and an unclamped value would draw a bar wider than
        ``BAR_WIDTH``.  The raw counts are still shown unmodified.
        """
        if self.total_apps <= 0:
            return " Apps [dim]Waiting for sitemap...[/]"

        done = self.scraped_apps + self.skipped_apps
        pct = max(0.0, min(done / self.total_apps * 100, 100.0))
        filled = int(pct / 100 * self.BAR_WIDTH)
        bar = f"[green]{'█' * filled}[/][dim]{'░' * (self.BAR_WIDTH - filled)}[/]"
        return f" Apps {bar} [bold]{done:,}[/] / [bold]{self.total_apps:,}[/] [dim]({pct:.1f}%)[/]"

    def _activity_markup(self):
        """Return the activity log as newline-joined markup lines."""
        lines = []
        for entry_type, name, detail in self.activity_log:
            style = self.ACTIVITY_STYLES.get(entry_type)
            if style is None:
                # Unknown entry types are not rendered.
                continue
            marker, detail_style = style
            # Pad first, then escape, so the column width is computed on the raw name.
            safe_name = self._escape(f"{name:<35}")
            safe_detail = self._escape(detail)
            lines.append(f" {marker} {safe_name} [{detail_style}]{safe_detail}[/]")

        if not lines:
            lines.append(" [dim]Waiting for first response...[/]")
        return "\n".join(lines)

    def _build_layout(self):
        """Build the full dashboard layout."""
        elapsed = self._elapsed()
        elapsed_str = str(timedelta(seconds=int(elapsed)))
        speed = self._apps_per_minute(elapsed)
        total_items = sum(self.item_counts.values())

        # === Stats rows ===
        stats = Table.grid(padding=(0, 2))
        for _ in range(4):
            stats.add_column()
        stats.add_row(
            f" ⏱ Elapsed: [bold]{elapsed_str}[/]",
            f"📊 Speed: [bold cyan]{speed:.0f}[/] apps/min",
            f"⏭ Skipped: [bold yellow]{self.skipped_apps:,}[/]",
            f"🔄 Retries: [bold]{self.retry_count}[/]",
        )
        stats.add_row(
            f" ✅ Scraped: [bold green]{self.scraped_apps:,}[/]",
            f"📦 Items: [bold cyan]{total_items:,}[/]",
            f"⚠️ Errors: [bold red]{self.error_count}[/]",
            f"🚫 429s: [bold red]{self.rate_limit_count}[/]",
        )

        # === Item counts table ===
        items_table = Table(
            title="📊 Item Counts",
            show_header=True,
            header_style="bold",
            expand=True,
            title_style="bold white",
            padding=(0, 1),
        )
        items_table.add_column("Type", style="cyan", ratio=2)
        items_table.add_column("Count", justify="right", style="bold green", ratio=1)
        for item_type, count in self.item_counts.items():
            emoji = self.ITEM_EMOJI.get(item_type, '•')
            items_table.add_row(f"{emoji} {item_type}", f"{count:,}")

        # === Side-by-side: item counts + recent activity ===
        side_by_side = Table.grid(padding=(0, 1))
        side_by_side.add_column(ratio=2)
        side_by_side.add_column(ratio=3)
        side_by_side.add_row(
            items_table,
            Panel(self._activity_markup(), title="[bold]📝 Recent Activity[/]", border_style="dim"),
        )

        # === Compose: blank rows act as vertical spacing ===
        layout_parts = Table.grid(padding=(0, 0))
        layout_parts.add_column(ratio=1)
        layout_parts.add_row("")
        layout_parts.add_row(self._progress_markup())
        layout_parts.add_row("")
        layout_parts.add_row(stats)
        layout_parts.add_row("")
        layout_parts.add_row(side_by_side)

        return Panel(
            layout_parts,
            title="[bold cyan]🛍️ Shopify App Store Scraper[/]",
            border_style="cyan",
            padding=(0, 1),
        )

    def _build_summary(self, elapsed, reason):
        """Build the final summary panel content.

        Args:
            elapsed: Crawl duration in seconds.
            reason: Close reason passed to ``spider_closed``.
        """
        elapsed_str = str(timedelta(seconds=int(elapsed)))
        speed = self._apps_per_minute(elapsed)

        summary = Table.grid(padding=(0, 2))
        summary.add_column()
        summary.add_column()

        summary.add_row(
            f" Reason: [bold]{self._escape(reason)}[/]",
            f"Duration: [bold]{elapsed_str}[/]",
        )
        summary.add_row(
            f" Scraped: [bold green]{self.scraped_apps:,}[/] apps",
            f"Skipped: [bold yellow]{self.skipped_apps:,}[/] apps",
        )
        summary.add_row(
            f" Total Items: [bold cyan]{sum(self.item_counts.values()):,}[/]",
            f"Speed: [bold]{speed:.0f}[/] apps/min",
        )
        summary.add_row(
            f" Errors: [bold red]{self.error_count}[/]",
            f"Rate Limits: [bold red]{self.rate_limit_count}[/]",
        )

        # Only item types that were actually scraped appear in the detail line.
        items_detail = " | ".join(
            f"{k}: [bold]{v:,}[/]" for k, v in self.item_counts.items() if v > 0
        )
        if items_detail:
            summary.add_row("", "")
            summary.add_row(f" [dim]{items_detail}[/]", "")

        return summary
extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} +EXTENSIONS = { + 'shopify_app_store.rich_ui.RichDashboard': 100, +} + +# Suppress default Scrapy logging (Rich UI handles display) +LOG_LEVEL = 'WARNING' +LOG_FORMAT = '%(message)s' # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/shopify_app_store/spiders/app_store.py b/shopify_app_store/spiders/app_store.py index 3e52c42..fa8cbd1 100644 --- a/shopify_app_store/spiders/app_store.py +++ b/shopify_app_store/spiders/app_store.py @@ -28,27 +28,44 @@ class AppStoreSpider(LastmodSpider): processed_reviews = {} def start_requests(self): - # Fetch existing apps from CSV - apps = pd.read_csv('{}{}{}'.format('./', WriteToCSV.OUTPUT_DIR, 'apps.csv')) - for _, app in apps.iterrows(): - self.processed_apps[app['url']] = {'url': app['url'], 'lastmod': app['lastmod'], 'id': app['id']} - - self.processed_reviews = pd.read_csv('{}{}{}'.format('./', WriteToCSV.OUTPUT_DIR, 'reviews.csv')) + # Fetch existing apps from CSV (deduplicate on load) + try: + apps = pd.read_csv('{}{}{}'.format('./', WriteToCSV.OUTPUT_DIR, 'apps.csv')) + apps = apps.drop_duplicates(subset=['id'], keep='last') + for _, app in apps.iterrows(): + self.processed_apps[app['url']] = {'url': app['url'], 'lastmod': app['lastmod'], 'id': app['id']} + self.logger.info('Loaded %d existing apps from CSV for resume', len(self.processed_apps)) + except (FileNotFoundError, pd.errors.EmptyDataError): + self.logger.info('No existing apps.csv found, starting fresh') + + try: + self.processed_reviews = pd.read_csv('{}{}{}'.format('./', WriteToCSV.OUTPUT_DIR, 'reviews.csv')) + self.processed_reviews = self.processed_reviews.drop_duplicates( + subset=['app_id', 'author', 'posted_at'], keep='last' + ) + self.logger.info('Loaded %d existing reviews from CSV for resume', len(self.processed_reviews)) + except (FileNotFoundError, 
pd.errors.EmptyDataError): + self.processed_reviews = pd.DataFrame(columns=['app_id', 'author', 'rating', 'posted_at', 'body']) + self.logger.info('No existing reviews.csv found, starting fresh') for url in self.sitemap_urls: yield Request(url, self._parse_sitemap) def parse(self, response): - app_id = str(uuid.uuid4()) app_url = response.url persisted_app = self.processed_apps.get(app_url, None) if persisted_app is not None: + # Always reuse the existing ID for this app URL + app_id = persisted_app.get('id', str(uuid.uuid4())) if persisted_app.get('lastmod') != response.meta['lastmod']: - self.logger.info('App\'s page got updated since %s, taking the existing id %s | URL: %s', - persisted_app.get('lastmod'), persisted_app.get('id'), app_url) - # Take id of the existing app - app_id = persisted_app.get('id', app_id) + self.logger.info('App\'s page got updated since %s, reusing id %s | URL: %s', + persisted_app.get('lastmod'), app_id, app_url) + else: + self.logger.info('Re-scraping app with same lastmod, reusing id %s | URL: %s', + app_id, app_url) + else: + app_id = str(uuid.uuid4()) response.meta['app_id'] = app_id self.processed_apps[app_url] = { @@ -60,6 +77,10 @@ def parse(self, response): for scraped_item in self.parse_app(response): yield scraped_item + # Notify Rich UI dashboard + if hasattr(self, '_rich_ui'): + self._rich_ui.notify_scraped(app_url) + reviews_url = '{}{}'.format(app_url, '/reviews') yield Request(reviews_url, callback=self.parse_reviews, meta={'app_id': app_id, 'lastmod': response.meta['lastmod'], 'skip_if_first_scraped': True}) @@ -100,8 +121,8 @@ def parse_app(self, response): url = response.request.url title = response.css('#adp-hero h1 ::text').extract_first(default='').strip() - developer = response.css('#adp-hero a[href^=\/partners]::text').extract_first().strip() - developer_link = 'https://{}{}'.format(self.BASE_DOMAIN, response.css('#adp-hero a[href^=\/partners]::attr(href)').extract_first().strip()) + developer = 
response.css('#adp-hero a[href*="/partners/"]::text').extract_first().strip() + developer_link = 'https://{}{}'.format(self.BASE_DOMAIN, response.css('#adp-hero a[href*="/partners/"]::attr(href)').extract_first().strip()) icon = response.css('#adp-hero img::attr(src)').extract_first() rating = response.css('#adp-hero dd > span.tw-text-fg-secondary ::text').extract_first() reviews_count_raw = response.css('#reviews-link::text').extract_first(default='0 Reviews') @@ -109,7 +130,7 @@ def parse_app(self, response): description_raw = response.css('#app-details').extract_first() description = ' '.join(response.css('#app-details ::text').extract()).strip() tagline = None - pricing_hint = response.css('#adp-hero > div > div.tw-grow.tw-flex.tw-flex-col.tw-gap-xl > dl > div:nth-child(1) > dd > div.tw-hidden.sm\:tw-block.tw-text-pretty ::text').extract_first().strip() + pricing_hint = response.css(r'#adp-hero > div > div.tw-grow.tw-flex.tw-flex-col.tw-gap-xl > dl > div:nth-child(1) > dd > div.tw-hidden.sm\:tw-block.tw-text-pretty ::text').extract_first().strip() for benefit in response.css('#app-details>ul>li'): yield KeyBenefit(app_id=app_id, diff --git a/shopify_app_store/spiders/lastmod_spider.py b/shopify_app_store/spiders/lastmod_spider.py index ced666d..e97552d 100644 --- a/shopify_app_store/spiders/lastmod_spider.py +++ b/shopify_app_store/spiders/lastmod_spider.py @@ -34,16 +34,33 @@ def _parse_sitemap(self, response): if any(x.search(entry['loc']) for x in self._follow): yield scrapy.Request(entry['loc'], callback=self._parse_sitemap) elif s.type == 'urlset': - for entry in it: + entries = list(it) + # Count total matching app URLs and pre-count skipped ones for Rich UI + matching_count = 0 + will_skip_count = 0 + for entry in entries: + if any(r.search(entry['loc']) for r, c in self._cbs): + matching_count += 1 + if self._is_loc_same_as_processed(entry['loc'], entry.get('lastmod')): + will_skip_count += 1 + + if not hasattr(self, '_total_counted') and hasattr(self, 
'_rich_ui'): + self._rich_ui.set_total_apps(matching_count) + # Pre-set skipped count so progress bar starts at correct % when resuming + self._rich_ui.skipped_apps = will_skip_count + self._total_counted = True + + for entry in entries: for r, c in self._cbs: if r.search(entry['loc']): app_url = entry['loc'] - if self._is_loc_same_as_processed(app_url, entry['lastmod']): + if self._is_loc_same_as_processed(app_url, entry.get('lastmod')): self.logger.info('Skipping app as it hasn\'t changed since %s | URL: %s', - entry['lastmod'], + entry.get('lastmod'), entry['loc']) # Skip apps which were scraped and haven't changed since they were added to the list continue yield scrapy.Request(entry['loc'], callback=c, meta={'lastmod': entry['lastmod']}) break +