-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract.js
More file actions
155 lines (134 loc) · 5.54 KB
/
extract.js
File metadata and controls
155 lines (134 loc) · 5.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Pure extraction + formatting helpers. No DOM dependency beyond
// `extractLegsFromDom`, which is called with a DOM subtree.
//
// Google Flights' booking page renders each flight leg inside a
// `.c257Jb` container, with children that concatenate several text
// fragments without spaces in the rendered innerText, e.g.
// "Etihad EY 146EconomyAirbus A321neoEY 146"
// The parsers below are tolerant of both the space-less (innerText)
// and space-separated (textContent) forms.
// Alternation is ordered longest-first so "Premium Economy" wins over "Economy".
// `\b` is not used because "146Economy" contains no word boundary: digits and
// letters are both word characters in JS regex, so `\b` cannot match between them.
// Captures the cabin-class label in group 1. The match is case-sensitive;
// "Premium Economy" is accepted with either capitalisation of "economy".
const CABIN_CLASS_RE = /(Premium Economy|Premium economy|Business|Economy|First)/
// Matches a 12-hour clock time such as "10:45 AM": one or two hour digits, a
// colon, two minute digits, at most one whitespace character, then AM or PM in
// any letter case.
const TIME_RE = /\d{1,2}:\d{2}\s?(?:AM|PM)/i
// Splits a flight summary such as
//   "Etihad EY 146EconomyAirbus A321neoEY 146"
// into its airline, flight number, cabin class and aircraft parts.
//
// The cabin-class label is the pivot: everything before it is
// "<airline> <flight number>", everything after it is
// "<aircraft><flight number repeated>".
//
// Throws an Error when no cabin class is present, or when the text before the
// cabin class does not end in a flight number.
export function parseAirlineAndFlight(text) {
  const cabinHit = text.match(CABIN_CLASS_RE)
  if (cabinHit === null) {
    throw new Error(`No cabin class in: ${text}`)
  }
  const cabinEnd = cabinHit.index + cabinHit[0].length
  const head = text.slice(0, cabinHit.index).trim()
  const tail = text.slice(cabinEnd).trim()

  // The flight number must sit at the very end of the text preceding the cabin.
  const designatorHit = head.match(/([A-Z0-9]{1,3}\s?\d{1,4})\s*$/)
  if (designatorHit === null) {
    throw new Error(`No flight number before cabin: ${head}`)
  }
  const flightNumber = designatorHit[1].replace(/\s+/g, " ")

  // The aircraft fragment ends with a repeat of the flight number. The removal
  // pattern is derived from the flight number just parsed, so digits in the
  // designator (e.g. "6E 1234") are never mistaken for trailing aircraft
  // digits (e.g. "A350EY 800").
  const repeatSource = `\\s*${flightNumber.replace(/\s+/g, "\\s?")}\\s*$`
  const trailingRepeat = new RegExp(repeatSource)

  return {
    airline: head.slice(0, designatorHit.index).trim(),
    flightNumber,
    cabinClass: cabinHit[1],
    aircraft: tail.replace(trailingRepeat, "").trim(),
  }
}
// Parses the flattened text of a departure or arrival element into its time,
// date and airport parts.
//
// Text forms seen:
//   "10:45 AM10:45 AM on Tue, May 12Geneva Airport (GVA)"
//   "12:35 PM+112:35 PM on Wed, May 13Narita International Airport (NRT)"
// and the same with spaces inserted between fragments.
//
// Parameters:
//   text            - the flattened element text.
//   allowDayOffset  - when true, a "+N" day-offset marker is parsed and the
//                     result carries a `dayOffset` property (0 when absent).
//                     Defaults to false.
//
// Returns { time, dayOfWeek, month, dayOfMonth, airport, iata } plus
// `dayOffset` when `allowDayOffset` is true. `time` is always rendered as
// "H:MM AM" / "H:MM PM" with exactly one space before the meridiem.
//
// Throws an Error when no time, no "on <day>, <month> <n>" date, or no trailing
// "(IATA)" code is found.
function parseTimeAndDate(text, { allowDayOffset = false } = {}) {
  const timeMatch = text.match(TIME_RE)
  if (!timeMatch) throw new Error(`No time in: ${text}`)
  // TIME_RE tolerates zero or one whitespace character before AM/PM, so force a
  // single plain space here; otherwise "10:45AM" and "10:45 AM" would yield
  // two different strings for the same time.
  const time = timeMatch[0].toUpperCase().replace(/\s*(AM|PM)$/, " $1")

  let dayOffset = 0
  if (allowDayOffset) {
    // The offset "+N" is followed by the time repeating ("+112:35 PM" means
    // offset 1, time 12:35 PM). The lazy quantifier plus the lookahead for the
    // next HH:MM keeps the offset from eating into the hour digits.
    const offsetMatch = text.match(/\+(\d{1,2}?)(?=\s*\d{1,2}:\d{2})/)
    if (offsetMatch) dayOffset = parseInt(offsetMatch[1], 10)
  }

  const dateMatch = text.match(/on\s+(\w+),\s+(\w+)\s+(\d{1,2})/)
  if (!dateMatch) throw new Error(`No date in: ${text}`)
  const [, dayOfWeek, month, dayOfMonth] = dateMatch

  const iataMatch = text.match(/\(([A-Z]{3})\)\s*$/)
  if (!iataMatch) throw new Error(`No IATA code in: ${text}`)
  const iata = iataMatch[1]

  // The airport name is whatever lies between the end of the date fragment and
  // the trailing "(IATA)" group.
  const airportStart = dateMatch.index + dateMatch[0].length
  const airportEnd = text.length - iataMatch[0].length
  const airport = text.slice(airportStart, airportEnd).trim()

  const result = {
    time,
    dayOfWeek,
    month,
    dayOfMonth: parseInt(dayOfMonth, 10),
    airport,
    iata,
  }
  if (allowDayOffset) result.dayOffset = dayOffset
  return result
}
// Parses a departure element's text. Departures carry no "+N" day-offset
// marker, so offset parsing is switched off and the result has no `dayOffset`.
export function parseDeparture(text) {
  const options = { allowDayOffset: false }
  return parseTimeAndDate(text, options)
}
// Parses an arrival element's text. Arrivals may carry a "+N" day-offset
// marker, so offset parsing is switched on and the result includes `dayOffset`.
export function parseArrival(text) {
  const options = { allowDayOffset: true }
  return parseTimeAndDate(text, options)
}
// Extracts the travel time from text such as "Travel time: 6 hr 35 min".
//
// Returns { duration, overnight }: `duration` is the matched "N hr", "N hr M min"
// or "M min" fragment with whitespace runs collapsed to single spaces, and
// `overnight` is true when the text contains "Overnight".
//
// Throws an Error when no "Travel time:" fragment can be matched.
export function parseDuration(text) {
  const travelTime = text.match(/Travel time:\s*(\d+\s*hr(?:\s*\d+\s*min)?|\d+\s*min)/)
  if (travelTime === null) {
    throw new Error(`Could not parse duration: ${text}`)
  }
  const [, rawDuration] = travelTime
  const duration = rawDuration.replace(/\s+/g, " ").trim()
  const overnight = /Overnight/.test(text)
  return { duration, overnight }
}
// Builds one leg record from the four raw strings of a leg:
//   mx  - airline / flight number / cabin / aircraft summary
//   dep - departure text
//   arr - arrival text
//   dur - travel-time text
//
// The airline/flight fields and the duration fields are flattened into the
// result; departure and arrival are nested under their own keys. Any error
// thrown by an individual parser propagates to the caller.
export function parseLegStrings({ mx, dep, arr, dur }) {
  // Parsers run in this fixed order, so the first malformed fragment is the
  // one whose error surfaces.
  const flight = parseAirlineAndFlight(mx)
  const departure = parseDeparture(dep)
  const arrival = parseArrival(arr)
  const timing = parseDuration(dur)
  return { ...flight, departure, arrival, ...timing }
}
// Escapes the characters that are significant in HTML text content.
// Quotes are left alone because values are only ever placed between tags,
// never inside attribute values.
function escapeHtmlText(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
}

// Renders one leg as three lines: a header with airline, flight number and
// duration, then a departure line and an arrival line.
//
// Parameters:
//   leg       - a leg record with `airline`, `flightNumber`, `duration`, and
//               `departure` / `arrival` objects carrying `airport`, `iata`,
//               `dayOfMonth`, `month` and `time`.
//   bold      - true selects HTML output (header wrapped in <b>…</b>); false
//               selects plain text (header wrapped in *…*).
//   linebreak - separator placed between the three lines.
//
// In HTML mode every interpolated value is HTML-escaped, because the values
// originate from page text and may contain "&", "<" or ">". Plain-text output
// is emitted verbatim.
function formatLegLines(leg, { bold, linebreak }) {
  const openB = bold ? "<b>" : "*"
  const closeB = bold ? "</b>" : "*"
  const show = bold ? escapeHtmlText : (value) => `${value}`
  const d = leg.departure
  const a = leg.arrival
  return [
    `${openB}${show(leg.airline)} ${show(leg.flightNumber)}${closeB} (dur: ${show(leg.duration)})`,
    `Departure ${show(d.airport)} (${show(d.iata)}) ${show(d.dayOfMonth)} ${show(d.month)} ${show(d.time)}`,
    `Arrival ${show(a.airport)} (${show(a.iata)}) ${show(a.dayOfMonth)} ${show(a.month)} ${show(a.time)}`,
  ].join(linebreak)
}
// Renders a list of legs as plain text: each leg's lines are separated by
// CRLF, and consecutive legs are separated by a blank line (two CRLFs).
// An empty list yields an empty string.
export function formatTripText(legs) {
  const options = { bold: false, linebreak: "\r\n" }
  const blocks = legs.map((leg) => formatLegLines(leg, options))
  return blocks.join("\r\n\r\n")
}
// Renders a list of legs as an HTML fragment: each leg's lines are separated
// by <br/>, and consecutive legs are separated by two <br/> tags.
// An empty list yields an empty string.
export function formatTripHtml(legs) {
  const options = { bold: true, linebreak: "<br/>" }
  const blocks = legs.map((leg) => formatLegLines(leg, options))
  return blocks.join("<br/><br/>")
}
// CSS class selectors queried by `extractLegsFromDom`. The class names are
// opaque; presumably they are generated by the page's build and may change
// without notice — TODO confirm.

// Container wrapping a single flight leg.
const LEG_CONTAINER_SELECTOR = ".c257Jb"
// Element whose text is fed to `parseAirlineAndFlight` (the `mx` string).
const MX_SELECTOR = ".MX5RWe"
// Element whose text is fed to `parseDeparture` (the `dep` string).
const DEP_SELECTOR = ".dPzsIb"
// Element whose text is fed to `parseArrival` (the `arr` string).
const ARR_SELECTOR = ".SWFQlc"
// Element whose text is fed to `parseDuration` (the `dur` string).
const DUR_SELECTOR = ".CQYfx"
// Returns an element's text with every whitespace run collapsed to a single
// space and the ends trimmed. `innerText` is preferred when it is a string;
// otherwise `textContent` is used. A missing element, or one with no text,
// yields "".
function readText(el) {
  if (!el) {
    return ""
  }
  const { innerText } = el
  const source = typeof innerText === "string" ? innerText : el.textContent
  const collapsed = (source || "").replace(/\s+/g, " ")
  return collapsed.trim()
}
// Finds every leg container under `root` and parses each one into a leg
// record via `parseLegStrings`. Returns the records in document order; an
// empty array when no container matches. A sub-element that is missing
// contributes an empty string, which the corresponding parser then rejects.
export function extractLegsFromDom(root) {
  const containers = Array.from(root.querySelectorAll(LEG_CONTAINER_SELECTOR))
  const parsedLegs = []
  for (const container of containers) {
    const textAt = (selector) => readText(container.querySelector(selector))
    const mx = textAt(MX_SELECTOR)
    const dep = textAt(DEP_SELECTOR)
    const arr = textAt(ARR_SELECTOR)
    const dur = textAt(DUR_SELECTOR)
    parsedLegs.push(parseLegStrings({ mx, dep, arr, dur }))
  }
  return parsedLegs
}