WIP refactor #14

Draft: wants to merge 5 commits into base `main`

2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]

steps:
- uses: actions/checkout@v4
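Dropping `3.8` from the CI matrix is consistent with the refactored module below: the `queue.Queue[Optional[DumpPageParams]]` annotation in `dump_worker`'s signature is evaluated at definition time, and PEP 585-style generics like that only subscript at runtime on Python 3.9+. A minimal illustration of the constraint (not code from this PR):

```python
# Illustration only: PEP 585-style generics like the ones in dump_worker()'s
# signature are runtime-subscriptable only on Python 3.9 and later.
import queue
from typing import Optional


def drain(tasks: queue.Queue[Optional[str]]) -> list[str]:
    # On 3.8 this `def` raises TypeError ("'type' object is not subscriptable")
    # unless postponed annotations are enabled.
    items: list[str] = []
    while not tasks.empty():
        item = tasks.get_nowait()
        if item is not None:
            items.append(item)
    return items
```

If the project's packaging metadata declares a minimum Python version, it would presumably need the same bump, but that file is not part of this diff.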
3 changes: 2 additions & 1 deletion README.md
@@ -10,13 +10,14 @@ Recommend using `dokuWikiDumper` on _modern_ filesystems, such as `ext4` or `btr

# For webmaster

If you don’t want your wiki to be archived, add the following to your `domain/robots.txt`:
We crawl every DokuWiki site (with a 1.5s crawl delay) every year and upload it to the Internet Archive. If you don’t want your wiki to be archived, add the following to your `<domain>/robots.txt`:

```robots.txt
User-agent: dokuWikiDumper
Disallow: /
```

Our bots are running on the following IPs: [wikiteam3.txt](https://static.saveweb.org/bots_ips/wikiteam3.txt) (ips, contact) | [wikiteam3.ips.txt](https://static.saveweb.org/bots_ips/wikiteam3.ips.txt) (ips)

## Requirements

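The `robots.txt` rule added above opts a wiki out by user agent. A minimal sketch of how such a rule is evaluated, using Python's standard-library parser (the hostname is hypothetical, and this is not necessarily how the project itself performs the check):

```python
# Illustration only: evaluating the README's robots.txt rule for the
# "dokuWikiDumper" user agent with the standard-library parser.
from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
parser.set_url("https://wiki.example.org/robots.txt")  # hypothetical wiki
parser.read()

# With "User-agent: dokuWikiDumper" / "Disallow: /" in place this returns
# False, so a compliant archiving bot skips the whole site.
allowed = parser.can_fetch("dokuWikiDumper", "https://wiki.example.org/doku.php?id=start")
print(allowed)
```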
195 changes: 96 additions & 99 deletions dokuWikiDumper/dump/content/__init__.py
@@ -1,34 +1,44 @@
import builtins
import os
import copy
from dataclasses import dataclass
import queue
import time
import threading
from typing import Callable, Optional

from bs4 import BeautifulSoup
from requests import Session

from dokuWikiDumper.exceptions import ActionEditDisabled, ActionEditTextareaNotFound, DispositionHeaderMissingError


from .revisions import getRevisions, getSourceEdit, getSourceExport, save_page_changes
from .titles import getTitles
from dokuWikiDumper.utils.util import loadTitles, smkdirs, uopen
from .revisions import get_revisions, get_source_edit, get_source_export, save_page_changes
from .titles import get_titles
from dokuWikiDumper.utils.util import load_titles, smkdirs, uopen
from dokuWikiDumper.utils.util import print_with_lock as print
from dokuWikiDumper.utils.config import running_config

sub_thread_error = None


@dataclass
class DumpPageParams:
dumpDir: str
getSource: Callable
title_index: int
title: str
doku_url: str
session: Session
current_only: bool

sub_thread_error = None

POSION = None

def dumpContent(doku_url: str = '', dumpDir: str = '', session: Session = None, skipTo: int = 0,
threads: int = 1, ignore_errors: bool = False, ignore_action_disabled_edit: bool = False,
ignore_disposition_header_missing: bool=False, current_only: bool = False):

def dump_content(*, doku_url: str, dumpDir: str, session: Session, skipTo: int = 0,
threads: int = 1, ignore_errors: bool = False, ignore_action_disabled_edit: bool = False, current_only: bool = False):
if not dumpDir:
raise ValueError('dumpDir must be set')

titles = loadTitles(titlesFilePath=dumpDir + '/dumpMeta/titles.txt')
titles = load_titles(titlesFilePath=dumpDir + '/dumpMeta/titles.txt')
if titles is None:
titles = getTitles(url=doku_url, session=session)
titles = get_titles(url=doku_url, session=session)
with uopen(dumpDir + '/dumpMeta/titles.txt', 'w') as f:
f.write('\n'.join(titles))
f.write('\n--END--\n')
@@ -38,124 +48,111 @@ def dumpContent(doku_url: str = '', dumpDir: str = '', session: Session = None,
return False

r1 = session.get(doku_url, params={'id': titles[0], 'do': 'export_raw'})
r2 = session.get(doku_url, params={'id': titles[0]})
r3 = session.get(doku_url, params={'id': titles[0], 'do': 'diff'})

getSource = getSourceExport
getSource = get_source_export
if 'html' in r1.headers['content-type']:
print('\nWarning: export_raw action not available, using edit action\n')
time.sleep(3)
getSource = getSourceEdit

soup = BeautifulSoup(r2.text, running_config.html_parser)
hidden_rev = soup.findAll(
'input', {
'type': 'hidden', 'name': 'rev', 'value': True})
use_hidden_rev = hidden_rev and hidden_rev[0]['value']
# TODO: what the `use_hidden_rev` is for?

soup = BeautifulSoup(r3.text, running_config.html_parser)
select_revs = soup.findAll(
'select', {
'class': 'quickselect', 'name': 'rev2[0]'})
# TODO: what the `select_revs` is for?

index_of_title = -1 # 0-based
getSource = get_source_edit

title_index = -1 # 0-based
if skipTo > 0:
index_of_title = skipTo - 2
title_index = skipTo - 2
titles = titles[skipTo-1:]

def try_dump_page(*args, **kwargs):
tasks_queue: queue.Queue[Optional[DumpPageParams]] = queue.Queue(maxsize=threads)

workers: list[threading.Thread] = []
# spawn workers
for _ in range(threads):
t = threading.Thread(target=dump_worker, args=(tasks_queue, ignore_errors, ignore_action_disabled_edit))
t.daemon = True
t.start()
workers.append(t)

task_templ = DumpPageParams(dumpDir=dumpDir, doku_url=doku_url, session=session, getSource=getSource, current_only=current_only,
title_index=-999, title="dokuwikidumper_placehold")

for title in titles:
if sub_thread_error:
raise sub_thread_error

title_index += 1
task = copy.copy(task_templ)
task.title_index = title_index
task.title = title
tasks_queue.put(task)
print('Content: (%d/%d): [[%s]] ...' % (title_index+1, len(titles), title))

for _ in range(threads):
tasks_queue.put(POSION)

tasks_queue.join()

for w in workers:
w.join()

if sub_thread_error:
raise sub_thread_error


def dump_worker(tasks_queue: queue.Queue[Optional[DumpPageParams]], ignore_errors: bool, ignore_action_disabled_edit: bool):
global sub_thread_error
while True:
task_or_posion = tasks_queue.get()
if task_or_posion is None:
tasks_queue.task_done()
break
task = task_or_posion

try:
dump_page(*args, **kwargs)
dump_page(task)
except Exception as e:
if isinstance(e, ActionEditDisabled) and ignore_action_disabled_edit:
print('[',args[2]+1,'] action disabled: edit. ignored')
print('[',task.title_index,'] action disabled: edit. ignored')
elif isinstance(e, ActionEditTextareaNotFound) and ignore_action_disabled_edit:
print('[',args[2]+1,'] action edit: textarea not found. ignored')
print('[',task.title_index,'] action edit: textarea not found. ignored')
elif not ignore_errors:
global sub_thread_error
sub_thread_error = e
raise e
else:
print('[',args[2]+1,']Error in sub thread: (', e, ') ignored')
for title in titles:
while threading.active_count() > threads:
time.sleep(0.1)
if sub_thread_error:
raise sub_thread_error
print('[',task.title_index,'] Error in sub thread: (', e, ') ignored')
finally:
tasks_queue.task_done()

index_of_title += 1
t = threading.Thread(target=try_dump_page, args=(dumpDir,
getSource,
index_of_title,
title,
doku_url,
session,
use_hidden_rev,
select_revs,
current_only,
ignore_disposition_header_missing,
))
print('Content: (%d/%d): [[%s]] ...' % (index_of_title+1, len(titles), title))
t.daemon = True
t.start()

while threading.active_count() > 1:
time.sleep(2)
print('Waiting for %d threads to finish' %
(threading.active_count() - 1), end='\r')


def dump_page(dumpDir: str,
getSource,
index_of_title: int,
title: str,
doku_url: str,
session: Session,
use_hidden_rev,
select_revs,
current_only: bool,
ignore_disposition_header_missing: bool):
srouce = getSource(doku_url, title, session=session,
ignore_disposition_header_missing=ignore_disposition_header_missing)
msg_header = '['+str(index_of_title + 1)+']: '
child_path = title.replace(':', '/')
def dump_page(task: DumpPageParams):
srouce = task.getSource(task.doku_url, task.title, session=task.session)
msg_header = '['+str(task.title_index + 1)+']: '
child_path = task.title.replace(':', '/')
child_path = child_path.lstrip('/')
child_path = '/'.join(child_path.split('/')[:-1])

smkdirs(dumpDir, '/pages/' + child_path)
with uopen(dumpDir + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
smkdirs(task.dumpDir, '/pages/' + child_path)
with uopen(task.dumpDir + '/pages/' + task.title.replace(':', '/') + '.txt', 'w') as f:
f.write(srouce)

if current_only:
print(msg_header, ' [[%s]] saved.' % (title))
if task.current_only:
print(msg_header, ' [[%s]] saved.' % (task.title))
return

revs = getRevisions(doku_url, title, use_hidden_rev,
select_revs, session=session, msg_header=msg_header)
revs = get_revisions(task.doku_url, task.title, session=task.session, msg_header=msg_header)


save_page_changes(dumpDir=dumpDir, child_path=child_path, title=title,
save_page_changes(dumpDir=task.dumpDir, child_path=child_path, title=task.title,
revs=revs, msg_header=msg_header)


for rev in revs[1:]:
if 'id' in rev and rev['id']:
try:
txt = getSource(doku_url, title, rev['id'], session=session,
ignore_disposition_header_missing=ignore_disposition_header_missing)
smkdirs(dumpDir, '/attic/' + child_path)
with uopen(dumpDir + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt', 'w') as f:
txt = task.getSource(task.doku_url, task.title, rev['id'], session=task.session)
smkdirs(task.dumpDir, '/attic/' + child_path)
with uopen(task.dumpDir + '/attic/' + task.title.replace(':', '/') + '.' + rev['id'] + '.txt', 'w') as f:
f.write(txt)
print(msg_header, ' Revision %s of [[%s]] saved.' % (
rev['id'], title))
rev['id'], task.title))
except DispositionHeaderMissingError:
print(msg_header, ' Revision %s of [[%s]] is empty. (probably deleted)' % (
rev['id'], title))
rev['id'], task.title))
else:
print(msg_header, ' Revision %s of [[%s]] failed: %s' % (rev['id'], title, 'Rev id not found (please check ?do=revisions of this page)'))


# time.sleep(1.5)

print(msg_header, ' Revision %s of [[%s]] failed: %s' % (rev['id'], task.title, 'Rev id not found (please check ?do=revisions of this page)'))
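The core of this refactor replaces the old pattern of spawning one thread per page and polling `threading.active_count()` with a fixed pool of daemon workers fed by a bounded `queue.Queue`, shut down by enqueuing one sentinel (`POSION` in the diff) per worker, and with failures surfaced to the main thread through `sub_thread_error`. A self-contained sketch of that producer/consumer shutdown pattern, with illustrative names rather than the PR's:

```python
# Minimal sketch of the worker-pool pattern adopted here: a bounded queue,
# N daemon workers, and one sentinel per worker for shutdown.
# Names are illustrative, not the PR's.
import queue
import threading

SENTINEL = None
THREADS = 4
tasks: queue.Queue = queue.Queue(maxsize=THREADS)


def worker() -> None:
    while True:
        item = tasks.get()
        try:
            if item is SENTINEL:
                break  # one sentinel stops exactly one worker
            print("processing", item)
        finally:
            tasks.task_done()  # pair task_done() with every get()


workers = [threading.Thread(target=worker, daemon=True) for _ in range(THREADS)]
for w in workers:
    w.start()

for title in ["start", "wiki:syntax", "playground"]:
    tasks.put(title)      # blocks when the queue is full (back-pressure)
for _ in workers:
    tasks.put(SENTINEL)   # poison pills, one per worker

tasks.join()              # returns once every put() has a matching task_done()
for w in workers:
    w.join()
```

The bounded queue (`maxsize=threads` in the PR) provides natural back-pressure, so titles are only enqueued as fast as the workers consume them.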