Skip to content

Commit 9f14849

Browse files
committed
Ubuntu 24, new mirror server
1 parent c2e21bd commit 9f14849

File tree

4 files changed

+23
-39
lines changed

4 files changed

+23
-39
lines changed

install_dependencies.sh

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,20 @@
11
#!/bin/bash
22

33
#
4-
# Tested on Ubuntu-22
4+
# Tested on Ubuntu-24
55
#
66

7-
sudo apt-get install -y postgresql-14
7+
sudo apt-get install -y postgresql-16
88
sudo -u postgres createuser -s $USER
99

10-
1110
# No significant performance increase above 250MB
12-
sudo -u postgres mkdir -p /etc/postgresql/14/main/conf.d/
11+
sudo -u postgres mkdir -p /etc/postgresql/16/main/conf.d/
1312
echo "
1413
work_mem = 250MB
15-
" | sudo -u postgres tee /etc/postgresql/14/main/conf.d/wikipedia.conf
14+
" | sudo -u postgres tee /etc/postgresql/16/main/conf.d/wikipedia.conf
1615

1716
sudo systemctl restart postgresql
1817

19-
20-
2118
sudo apt-get install -y wget coreutils nodejs jq moreutils pigz
2219
sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential
2320

steps/latest_available_data.sh

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#!/bin/bash
22

33
#
4-
# Prints a YYYYMMDD date of the latest available date on
5-
# https://mirror.clarkson.edu/wikimedia/enwiki/
4+
# Prints the date (YYYYMMDD) of the latest available dump on
5+
# https://wikidata.aerotechnet.com/enwiki/
66
# We do some additional checks if the dumps are complete, too
77
#
88

@@ -12,7 +12,6 @@ debug() {
1212
echo -n ''
1313
}
1414

15-
1615
DATE=''
1716

1817
# Sets $DATE to first of the month (YYYYMMDD). If given a parameter then
@@ -21,13 +20,12 @@ set_date_to_first_of_month() {
2120
MINUS_NUM_MONTHS=${1:-0}
2221

2322
if [[ "$(uname)" == "Darwin" ]]; then
24-
DATE=$(date -v -${MINUS_NUM_MONTHS}m +%Y%m01)
23+
DATE=$(date -v -${MINUS_NUM_MONTHS}m +%Y%m01)
2524
else
26-
DATE=$(date --date="-$MINUS_NUM_MONTHS month" +%Y%m01)
25+
DATE=$(date --date="-$MINUS_NUM_MONTHS month" +%Y%m01)
2726
fi
2827
}
2928

30-
3129
check_all_files_ready() {
3230
CHECK_DATE=$1
3331
debug "check_all_files_ready for $CHECK_DATE"
@@ -55,14 +53,13 @@ check_all_files_ready() {
5553

5654
ANY_FILE_MISSING=0
5755

58-
5956
##
6057
## 1. Chinese (ZH) Wikipedia
6158
## usually the last to be dumped
6259
##
6360
# from wikipedia_download.sh
6461
WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks linktarget redirect"
65-
DUMP_RUN_INFO_URL="https://mirror.clarkson.edu/wikimedia/zhwiki/$CHECK_DATE/dumpruninfo.json"
62+
DUMP_RUN_INFO_URL="https://wikidata.aerotechnet.com/zhwiki/$CHECK_DATE/dumpruninfo.json"
6663
debug $DUMP_RUN_INFO_URL
6764
DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL")
6865

@@ -71,7 +68,6 @@ check_all_files_ready() {
7168
return 1
7269
fi
7370

74-
7571
for FN in $WIKIPEDIA_REQUIRED_FILES; do
7672
TABLENAME=${FN//_/}table # redirect => redirecttable
7773
debug "checking status for table $TABLENAME"
@@ -85,15 +81,13 @@ check_all_files_ready() {
8581
fi
8682
done
8783

88-
89-
9084
##
9185
## 2. Wikidata
9286
##
9387
# from wikidata_download.sh
9488
WIKIDATA_REQUIRED_FILES="geo_tags page wb_items_per_site"
9589

96-
DUMP_RUN_INFO_URL="https://mirror.clarkson.edu/wikimedia/wikidatawiki/$CHECK_DATE/dumpruninfo.json"
90+
DUMP_RUN_INFO_URL="https://wikidata.aerotechnet.com/wikidatawiki/$CHECK_DATE/dumpruninfo.json"
9791
debug $DUMP_RUN_INFO_URL
9892
DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL")
9993

@@ -118,17 +112,15 @@ check_all_files_ready() {
118112
return $ANY_FILE_MISSING
119113
}
120114

121-
122-
123115
#
124116
# Usually you might try to get a list of dates from
125-
# https://mirror.clarkson.edu/wikimedia/enwiki/ and then sort them, then look at status.html
117+
# https://wikidata.aerotechnet.com/enwiki/ and then sort them, then look at status.html
126118
# inside the directories.
127119
#
128120
# We want to avoid parsing HTML.
129121
#
130122
# Previous version of this script then looked at index.json
131-
# (https://mirror.clarkson.edu/wikimedia/index.json) but the file is written at beginning
123+
# (https://wikidata.aerotechnet.com/index.json) but the file is written at beginning
132124
# of the export, so on the first of the month it would list files that don't exist yet.
133125
#
134126

steps/wikidata_download.sh

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,10 @@ echo "====================================================================="
88
: ${BUILDID:=latest}
99
# List of mirrors https://dumps.wikimedia.org/mirrors.html
1010
# Download using main dumps.wikimedia.org: 60 minutes, mirror: 20 minutes
11-
: ${WIKIMEDIA_HOST:=mirror.clarkson.edu/wikimedia}
12-
# See list on https://mirror.clarkson.edu/wikimedia/wikidatawiki/
11+
: ${WIKIMEDIA_HOST:=wikidata.aerotechnet.com}
12+
# See list on https://wikidata.aerotechnet.com/wikidatawiki/
1313
: ${WIKIDATA_DATE:=20220701}
1414

15-
1615
DOWNLOADED_PATH="$BUILDID/downloaded/wikidata"
1716
mkdir -p $DOWNLOADED_PATH
1817

@@ -33,12 +32,12 @@ download() {
3332

3433
for FN in geo_tags.sql.gz page.sql.gz wb_items_per_site.sql.gz; do
3534

36-
# https://mirror.clarkson.edu/wikimedia/wikidatawiki/20220620/wikidatawiki-20220620-geo_tags.sql.gz
37-
# https://mirror.clarkson.edu/wikimedia/wikidatawiki/20220620/md5sums-wikidatawiki-20220620-geo_tags.sql.gz.txt
38-
download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/wikidatawiki-$WIKIDATA_DATE-$FN "$DOWNLOADED_PATH/$FN"
35+
# https://wikidata.aerotechnet.com/wikidatawiki/20250501/wikidatawiki-20250501-geo_tags.sql.gz
36+
# https://wikidata.aerotechnet.com/wikidatawiki/20250501/md5sums-wikidatawiki-20250501-geo_tags.sql.gz.txt
37+
download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/wikidatawiki-$WIKIDATA_DATE-$FN "$DOWNLOADED_PATH/$FN"
3938
download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/md5sums-wikidatawiki-$WIKIDATA_DATE-$FN.txt "$DOWNLOADED_PATH/$FN.md5"
4039

41-
EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$FN.md5" | cut -d\ -f1)
40+
EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$FN.md5" | cut -d\ -f1)
4241
CALCULATED_MD5=$(md5sum "$DOWNLOADED_PATH/$FN" | cut -d\ -f1)
4342

4443
if [[ "$EXPECTED_MD5" != "$CALCULATED_MD5" ]]; then

steps/wikipedia_download.sh

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,12 @@ echo "====================================================================="
1111
LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' '))
1212
# List of mirrors https://dumps.wikimedia.org/mirrors.html
1313
# Download using main dumps.wikimedia.org: 150 minutes, mirror: 40 minutes
14-
: ${WIKIMEDIA_HOST:=mirror.clarkson.edu/wikimedia}
15-
# See list on https://mirror.clarkson.edu/wikimedia/enwiki/
14+
: ${WIKIMEDIA_HOST:=wikidata.aerotechnet.com}
15+
# See list on https://wikidata.aerotechnet.com/enwiki/
1616
: ${WIKIPEDIA_DATE:=20220620}
1717

18-
1918
DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia"
2019

21-
2220
download() {
2321
echo "Downloading $1 > $2"
2422
if [ -e "$2" ]; then
@@ -35,8 +33,7 @@ download() {
3533
du -h "$2" | cut -f1
3634
}
3735

38-
for LANG in "${LANGUAGES_ARRAY[@]}"
39-
do
36+
for LANG in "${LANGUAGES_ARRAY[@]}"; do
4037
echo "Language: $LANG"
4138

4239
mkdir -p "$DOWNLOADED_PATH/$LANG"
@@ -55,13 +52,12 @@ do
5552
# 62M downloaded/tr/linktarget.sql.gz
5653
# 4.2M downloaded/tr/redirect.sql.gz
5754

58-
5955
for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz linktarget.sql.gz redirect.sql.gz; do
6056

61-
download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN"
57+
download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN"
6258
download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/md5sums-${LANG}wiki-$WIKIPEDIA_DATE-$FN.txt "$DOWNLOADED_PATH/$LANG/$FN.md5"
6359

64-
EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$LANG/$FN.md5" | cut -d\ -f1)
60+
EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$LANG/$FN.md5" | cut -d\ -f1)
6561
CALCULATED_MD5=$(md5sum "$DOWNLOADED_PATH/$LANG/$FN" | cut -d\ -f1)
6662

6763
if [[ "$EXPECTED_MD5" != "$CALCULATED_MD5" ]]; then

0 commit comments

Comments
 (0)