Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions snscrape/modules/telegram.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
url = rawUrl.replace('//t.me/', '//t.me/s/')
date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
if (message := post.find('div', class_ = 'tgme_widget_message_text')):
content = message.text
content = self.get_post_text(message)
outlinks = []
for link in post.find_all('a'):
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
Expand Down Expand Up @@ -143,6 +143,14 @@ def get_items(self):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')

@staticmethod
def get_post_text(post) -> str:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter should be named `message`, not `post`, to avoid confusion with the `post` variable in `_soup_to_items`. This method should also not be part of the public API, so it should be `_get_message_text(message)`.

result = []
# Using the features of the BS4 module itself
for s in post.stripped_strings:
result.append(s)
return '\n'.join(result)
Comment on lines +148 to +152

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be simplified to `'\n'.join(post.stripped_strings)`, but it doesn't do the right thing anyway: it splits links out onto separate lines, and it doesn't preserve multiple consecutive line breaks. A good test case for both is https://t.me/telegram/201. It looks like this might require explicitly replacing the `<br>` tags.


def _get_entity(self):
kwargs = {}
# /channel has a more accurate member count and bigger profile picture
Expand Down Expand Up @@ -200,4 +208,4 @@ def _cli_setup_parser(cls, subparser):

@classmethod
def _cli_from_args(cls, args):
return cls._cli_construct(args, args.channel)
return cls._cli_construct(args, args.channel)
Comment on lines -203 to +211

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is still an undesired whitespace change. There should be an LF at the end of a (text-ish) file.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Looks like GitHub doesn't display this correctly on the PR page itself, only in the full diff: https://github.com/JustAnotherArchivist/snscrape/pull/687/files#diff-7f40c11448f92ed2f5d1764136d372d15faa3d4da0272813e88478c4d8870a09L203)