#!/bin/bash
# I want to collect listings from daft on a daily basis and update the JSON file,
# without duplicates in the data.
# The script should collect data from all pages when there's no file yet, and stop
# after the first page whose listings are all already known when data exists.
# Page one is updated a few times per hour, so it always carries one or more new
# listings; the first page where all_exist returns true is therefore likely page 2 or 3.
set -euo pipefail
echo "Running scraper"
OUTPUT_FILE="output.json"
# First run: create an empty array so the jq reads below don't fail under set -e.
[ -f "$OUTPUT_FILE" ] || echo "[]" > "$OUTPUT_FILE"
TEMP_FILE=$(mktemp)
# Remove the temp file on any exit so interrupted runs don't leave it behind.
trap 'rm -f "$TEMP_FILE"' EXIT
echo "[]" > "$TEMP_FILE"
# Pagination starts at 1; page 0 returns a 404.
PAGE=1
get_daft(){
# get_daft fetches one results page and prints its listings as a JSON array,
# with bulky image/branding fields stripped out.
local page="$1"
curl -s "https://www.daft.ie/sharing/dublin?page=${page}" \
-H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' \
-H 'accept-language: en-GB,en;q=0.9,en-US;q=0.8,sv;q=0.7,ro;q=0.6' \
-H 'cache-control: no-cache' \
-H 'pragma: no-cache' \
-H 'priority: u=0, i' \
-H 'sec-ch-ua: "Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "Linux"' \
-H 'sec-fetch-dest: document' \
-H 'sec-fetch-mode: navigate' \
-H 'sec-fetch-site: same-origin' \
-H 'sec-fetch-user: ?1' \
-H 'upgrade-insecure-requests: 1' \
-H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36' \
| grep -Pzo '(?s)(?<=<script id="__NEXT_DATA__" type="application/json" crossorigin="anonymous">).*?(?=</script>)' |
# grep -z NUL-terminates its match; strip the NUL so jq can parse the JSON
tr -d '\0' |
jq '[
.props.pageProps.listings[].listing | delpaths([
["image"],
["prs"],
["media"],
["seller", "profileImage"],
["seller", "profileRoundedImage"],
["seller", "squareLogo"],
["seller", "standardLogo"],
["pageBranding"]
])
]'
}
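# Quick smoke test, assuming network access and that daft.ie still embeds its
# __NEXT_DATA__ blob in the page:
#   get_daft 1 | jq 'length'   # number of listings parsed from page 1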
check_id(){
# Checks each listing id on the page against the known ids passed in. New
# listings are appended to the temp file.
local new_listings="$1"
local existing_ids="$2"
local TEMP_FILE="$3"
local all_exist=true
while read -r listing; do
id=$(echo "$listing" | jq -r '.id')
if grep -Fxq "$id" <<< "$existing_ids"; then
echo "id exists $id"
else
echo "$listing" | jq '{title: .title, id: .id}'
jq -c ". + [$listing]" "$TEMP_FILE" > "$TEMP_FILE.tmp"
mv "$TEMP_FILE.tmp" "$TEMP_FILE"
all_exist=false
fi
done < <(echo "$new_listings" | jq -c '.[]')
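# The loop runs via process substitution rather than a pipe so that all_exist,
# set inside the loop, is still visible here.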
$all_exist && return 0 || return 1
}
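# Contract: returns 0 when every id on the page was already known (the main loop
# treats this as the signal to stop paging), 1 when at least one new listing was saved.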
while true; do
echo "Scraping page $PAGE"
new_listings="$(get_daft "$PAGE")"
if [ -z "$new_listings" ] || [ "$new_listings" = "[]" ]; then
echo "No listings found or end reached."
break
fi
existing_ids=$(jq -r '.[].id' "$OUTPUT_FILE" | sort -u)
if check_id "$new_listings" "$existing_ids" "$TEMP_FILE"; then
echo "All IDs on page already exist. Stopping early".
break
else
echo "New listings on $PAGE"
fi
((PAGE++))
done
echo "Merging data and writing to output.json"
jq -s 'add | unique_by(.id)' "$OUTPUT_FILE" "$TEMP_FILE" > "$OUTPUT_FILE.tmp"
mv "$OUTPUT_FILE.tmp" "$OUTPUT_FILE"
echo "Scraping completed"
sqlite-utils insert daft.db sharing output.json --pk=id --ignore --flatten --alter
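# Optional sanity check, assuming the sqlite3 CLI is available:
#   sqlite3 daft.db 'SELECT COUNT(*) FROM sharing;'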
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git add "$OUTPUT_FILE"
git commit -m "Daily daft listings update" || echo "No changes"
git push origin master