Skip to content

Commit d4e6880

Browse files
committed
improve crawl process
1 parent 9c6b0f9 commit d4e6880

File tree

4 files changed

+243
-140
lines changed

4 files changed

+243
-140
lines changed

client/api/divar.ts

Lines changed: 187 additions & 109 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import { toEnglishDigits } from "@/utils/format";
2+
import { randomBetween } from "@/utils/number";
23
import { ofetch } from "ofetch";
34

45
const CITY = '1'; // Tehran city ID
@@ -14,21 +15,22 @@ const httpClient = ofetch.create({
1415
},
1516
})
1617

18+
/**
 * Callback through which the crawl helpers report how far along they are.
 *
 * @typeParam T - Type of the partial result handed to the callback.
 * @param value - Current progress position, measured against `max`.
 * @param max - Value of `value` that corresponds to completion.
 * @param title - Label of the step that is currently running.
 * @param lastValue - Result accumulated so far.
 */
export type ProgressFunction<T> = (
  value: number,
  max: number,
  title: string,
  lastValue: T,
) => void;
19+
1720
/**
 * A single apartment listing as produced by the crawl helpers in this file.
 */
export type House = {
  /** Post token; `getHouse` interpolates it into the post-details URL. */
  token: string,
  location: {
    lat: number,
    lng: number,
    /**
     * `true` when the coordinates were read from the post itself (`getHouse`),
     * `false` when they were drawn at random inside the district bounding box
     * (`getDistrictHouses`).
     */
    exact: boolean,
  },
  size: number,
  price: number, // per meter
};
30+
31+
export const getHouse = (token: string): Promise<House> => {
3032
// eslint-disable-next-line @typescript-eslint/no-explicit-any
31-
return httpClient(`/v8/posts-v2/web/${id}`).then((resp: any) => {
33+
return httpClient(`/v8/posts-v2/web/${token}`).then((resp: any) => {
3234
// eslint-disable-next-line @typescript-eslint/no-explicit-any
3335
let fields: Record<string, any> = {};
3436

@@ -53,19 +55,12 @@ export const getHouse = (id: string): Promise<House> => {
5355
}
5456
}
5557
}
56-
58+
if (typeof fields.location?.latitude !== 'number' || typeof fields.location?.longitude !== 'number' || typeof fields['متراژ'] !== 'string' || fields['متراژ'] === '' || typeof fields['قیمت هر متر'] !== 'string' || fields['قیمت هر متر'].trim() === '') throw new Error('cannot parse');
5759
return {
58-
location: typeof fields.location?.latitude === 'number' && typeof fields.location?.longitude === 'number' ? { lat: fields.location?.latitude, lng: fields.location.longitude } : null,
59-
size: typeof fields['متراژ'] === 'string' && fields['متراژ'].trim() !== '' ? +fields['متراژ'] : null,
60-
beds: typeof fields['اتاق'] === 'string' && fields['اتاق'].trim() !== '' ? +fields['اتاق'] : null,
61-
// floor: typeof fields['طبقه'] === 'string' && fields['طبقه'].trim() !== '' ? fields['طبقه'].split(' ').filter(x=>!!x).map(x => +x) as [number, number] | [number] : null,
62-
totalPrice: typeof fields['قیمت کل'] === 'string' && fields['قیمت کل'].trim() !== '' ? +fields['قیمت کل'] : null,
63-
unitPrice: typeof fields['قیمت هر متر'] === 'string' && fields['قیمت هر متر'].trim() !== '' ? +fields['قیمت هر متر'] : null,
64-
elevator: 'آسانسور ندارد' in fields || fields['آسانسور'] === false ? false : fields['آسانسور'] === true ? true : null,
65-
storage: 'انباری ندارد' in fields || fields['انباری'] === false ? false : fields['انباری'] === true ? true : null,
66-
parking: 'پارکینگ ندارد' in fields || fields['پارکینگ'] === false ? false : fields['پارکینگ'] === true ? true : null,
67-
balcony: 'بالکن ندارد' in fields || fields['بالکن'] === false ? false : fields['بالکن'] === true ? true : null,
68-
yearBuilt: typeof fields['ساخت'] === 'string' && fields['ساخت'] !== '' ? +fields['ساخت'] : null,
60+
token,
61+
location: { lat: fields.location?.latitude, lng: fields.location.longitude, exact: true },
62+
size: +fields['متراژ'],
63+
price: +fields['قیمت هر متر'],
6964
};
7065
});
7166
}
@@ -76,135 +71,218 @@ export type District = {
7671
value: string,
7772
hint: string,
7873
keywords: string[],
74+
cityId: string,
75+
boundingBox: {
76+
minLat: number,
77+
maxLat: number,
78+
minLng: number,
79+
maxLng: number,
80+
},
7981
}
8082

81-
export const getDistricts = (): Promise<District[]> => {
82-
return httpClient(
83+
/**
 * Fetches the districts of a city together with an approximate bounding box
 * for each of them.
 *
 * The district list is read from the filters endpoint. Afterwards one search
 * request per district is issued, sequentially, and the map camera bounding
 * box found in that response is stored as the district's bounding box.
 *
 * @param cityId - City id sent as `city_ids` and copied onto every district.
 * @param progressFn - Optional callback invoked before each request; its last
 *   argument is the list of districts resolved so far.
 * @returns The districts, in the order the filters endpoint listed them.
 * @throws Error when a search response carries no numeric bounding box.
 *   Errors raised by the HTTP client propagate unchanged.
 */
export const getDistricts = async (cityId: string, progressFn?: ProgressFunction<District[]>): Promise<District[]> => {
  let returnValue: District[] = [];
  const progress = (value: number, max: number, title: string) => {
    progressFn?.(value, max, title, returnValue);
  };

  progress(0, 1, 'Fetching Districts');

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const filtersResponse = await httpClient<any>("/v8/postlist/w/filters", {
    method: 'POST',
    cache: 'force-cache',
    body: {
      city_ids: [cityId],
      source_view: "FILTER",
      data: {},
    },
  });

  // The neighbourhood options sit two widget levels deep in the response.
  const districtsWithoutBbox: Omit<District, 'boundingBox'>[] = [];
  for (const widget of (filtersResponse?.page?.widget_list ?? [])) {
    for (const subWidget of (widget?.data?.widget_list ?? [])) {
      for (const district of (subWidget?.data?.neighborhoods?.options ?? [])) {
        districtsWithoutBbox.push({
          title: district?.title as string,
          value: district?.value as string,
          hint: district?.hint as string,
          // An option without search keywords must not abort the whole crawl.
          keywords: typeof district?.search_keywords === 'string'
            ? district.search_keywords.split('،').map((x: string) => x.trim())
            : [],
          cityId,
        });
      }
    }
  }

  for (const [index, districtWithoutBbox] of districtsWithoutBbox.entries()) {
    progress(index + 1, districtsWithoutBbox.length + 1, 'Fetching Districts Approximate Bounding Box');
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const searchResult = await httpClient<any>("/v8/postlist/w/search", {
      method: 'POST',
      cache: 'force-cache',
      body: {
        city_ids: [cityId],
        source_view: "FILTER",
        search_data: {
          form_data: {
            data: {
              category: { str: { value: "apartment-sell" } },
              districts: { repeated_string: { value: [districtWithoutBbox.value] } },
            },
          },
        },
      },
    });

    const bbox = searchResult?.map_data?.state?.camera_info?.bbox;
    if (
      typeof bbox?.min_latitude !== 'number' ||
      typeof bbox?.max_latitude !== 'number' ||
      typeof bbox?.min_longitude !== 'number' ||
      typeof bbox?.max_longitude !== 'number'
    ) {
      throw new Error(`unexpected error: no bounding box for district "${districtWithoutBbox.title}" (${districtWithoutBbox.value})`);
    }

    // Build a new array on every step so that each progress callback receives
    // its own snapshot instead of a list that keeps changing underneath it.
    returnValue = [
      ...returnValue,
      {
        ...districtWithoutBbox,
        boundingBox: {
          minLat: bbox.min_latitude,
          maxLat: bbox.max_latitude,
          minLng: bbox.min_longitude,
          maxLng: bbox.max_longitude,
        },
      },
    ];
  }

  return returnValue;
}
114164

115-
export type SearchHousesFilters = {
116-
district?: string[];
165+
/**
 * Search filters for the apartment listings of one district.
 *
 * Every optional filter that is left `undefined` is omitted from the search
 * request built by `getDistrictHouses`.
 */
export type GetDistrictHousesFilters = {
  /** District to search; its bounding box is used for approximate locations. */
  district: District;
  elevator?: boolean;
  parking?: boolean;
  balcony?: boolean;
  /** `[minimum, maximum]`, forwarded as the `size` number range. */
  size?: [number, number];
  /** `[minimum, maximum]`, forwarded as the `price` number range. */
  price?: [number, number];
}
123173

124-
export const getHousesIds = (filters: SearchHousesFilters): Promise<string[]> => {
125-
return httpClient("/v8/postlist/w/search", {
174+
/**
 * Lists the apartments for sale in one district using a single search request.
 *
 * Size and price are parsed from the text of each result row, so the returned
 * houses are approximate: `price` is the listed total divided by the parsed
 * size, and `location` is a random point inside the district's bounding box
 * (`exact: false`).
 *
 * @param filters - District to search plus optional listing filters.
 * @returns One house per result row that could be parsed; rows without a
 *   token, a size in the title or a price in the description are skipped.
 */
export const getDistrictHouses = async (filters: GetDistrictHousesFilters): Promise<House[]> => {
  const { district } = filters;

  const apiFilters = {
    category: { str: { value: "apartment-sell" } },
    // `repeated_string` carries a list, even when only one district is queried.
    districts: { repeated_string: { value: [district.value] } },
    ...(typeof filters.elevator !== 'undefined' && {
      elevator: { boolean: { value: filters.elevator } },
    }),
    ...(typeof filters.parking !== 'undefined' && {
      parking: { boolean: { value: filters.parking } },
    }),
    ...(typeof filters.balcony !== 'undefined' && {
      balcony: { boolean: { value: filters.balcony } },
    }),
    ...(typeof filters.size !== 'undefined' && {
      size: { number_range: { minimum: filters.size[0], maximum: filters.size[1] } },
    }),
    ...(typeof filters.price !== 'undefined' && {
      price: { number_range: { minimum: filters.price[0], maximum: filters.price[1] } },
    }),
  };

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const searchResult = await httpClient<any>("/v8/postlist/w/search", {
    method: 'POST',
    body: {
      // Search the city the district belongs to; the module default is only a
      // fallback for district objects that carry no city id.
      city_ids: [district.cityId ?? CITY],
      source_view: 'FILTER',
      disable_recommendation: true,
      search_data: {
        form_data: {
          data: apiFilters,
        },
      },
    },
  });

  const houses: House[] = [];
  for (const widget of (searchResult?.list_widgets ?? [])) {
    if (widget?.widget_type !== 'POST_ROW') continue;
    const token = widget?.data?.token;
    const title = widget?.data?.title; // "اپارتمان 105 متری موقعیت برای خانه اولی ها"
    const description = widget?.data?.middle_description_text; // "۴۹۰,۰۰۰,۰۰۰ تومان"

    if (typeof token !== 'string' || token === '') continue;
    if (typeof title !== 'string' || !title.includes('متر')) continue;
    if (typeof description !== 'string' || !description.includes('تومان')) continue;

    // The size is the first number that is directly followed by "متر".
    const sizeMatch = /(\d+(?:[.,]\d+)?)\s*متر/.exec(toEnglishDigits(title));
    if (!sizeMatch) continue;
    const size = +sizeMatch[1];
    // A zero size would turn the per-meter price into Infinity or NaN.
    if (!Number.isFinite(size) || size <= 0) continue;

    // Whitespace is kept on purpose: a description holding several numbers
    // then parses to NaN instead of being glued into one bogus amount.
    const totalPrice = +toEnglishDigits(description).replace(/[^\d\s]/g, '');
    // An empty digit string parses to 0, which is not a usable price.
    if (!Number.isFinite(totalPrice) || totalPrice <= 0) continue;

    houses.push({
      token,
      size,
      price: totalPrice / size,
      location: {
        lat: randomBetween(district.boundingBox.minLat, district.boundingBox.maxLat),
        lng: randomBetween(district.boundingBox.minLng, district.boundingBox.maxLng),
        exact: false,
      },
    });
  }
  return houses;
}
179239

240+
/**
 * Filters for a city-wide crawl: the per-district listing filters without the
 * district itself, plus the id of the city whose districts are crawled.
 */
export type GetAllHousesFilters = Omit<GetDistrictHousesFilters, 'district'> & {
  cityId: string,
};
180243

181-
export const getHouses = async (filters: SearchHousesFilters, progressFn?: (value: number, lastValue: House[]) => void): Promise<House[]> => {
182-
let returnValue: House[] = [];
183-
let currentProgress = 0;
184-
const progress = (value: number) => {
185-
currentProgress = value;
186-
progressFn?.(currentProgress, returnValue);
187-
}
188-
progress(0);
189-
const districts = await getDistricts();
190-
191-
let passedDistricts = 0;
192-
for (const district of districts) {
193-
progress(passedDistricts / districts.length);
194-
const districtHouseIds = await getHousesIds({
195-
...filters,
196-
district: [district.value],
197-
});
198-
199-
200-
for (const houseId of districtHouseIds) {
201-
await new Promise((resolve) => setTimeout(resolve, 1000));
202-
returnValue = [...returnValue, await getHouse(houseId)];
203-
progress((passedDistricts + (districtHouseIds.indexOf(houseId) / districtHouseIds.length)) / districts.length);
204-
}
244+
/**
 * Crawls the apartments of a whole city in three steps:
 *
 * 1. load the city's districts and their bounding boxes,
 * 2. list the houses of every district (approximate size, price and location),
 * 3. look each house up individually and replace the approximate entry with
 *    the verified one.
 *
 * Progress is reported on a scale of 0 to 3, one unit per step.
 *
 * @param filters - City id plus optional listing filters.
 * @param progressFn - Optional callback; its last argument is the list of
 *   houses collected so far.
 * @returns All collected houses. A house whose lookup failed stays in the list
 *   with its approximate data.
 */
export const getAllCityHouses = async (filters: GetAllHousesFilters, progressFn?: ProgressFunction<House[]>): Promise<House[]> => {
  let returnValue: House[] = [];
  const progress = (value: number, max: number, title: string) => {
    progressFn?.(value, max, title, returnValue);
  };
  progress(0, 1, '');

  // Step 1: prepare
  const districts = await getDistricts(filters.cityId, (a, b, t) => {
    progress(a / b, 3, t);
  });

  // Step 2: fetch
  for (const [index, district] of districts.entries()) {
    progress(index / districts.length + 1, 3, 'Fetching Districts Houses');
    const districtHouses = await getDistrictHouses({
      ...filters,
      district,
    });
    returnValue = [...returnValue, ...districtHouses];
  }

  // Step 3: verify
  const approximateHouses = [...returnValue];
  for (const [index, approximateHouse] of approximateHouses.entries()) {
    // Progress follows the number of attempted houses, so a failed lookup
    // still moves the bar forward.
    progress(index / approximateHouses.length + 2, 3, 'Verifying Houses Data');
    try {
      const validatedHouse = await getHouse(approximateHouse.token);
      // Swap the entry in place so the house keeps its position in the list.
      returnValue = returnValue.map((house) => (house === approximateHouse ? validatedHouse : house));
    } catch {
      // Best effort: when the lookup fails, keep the approximate entry.
    }
    // Throttle after failures as well, but do not wait after the final house.
    if (index < approximateHouses.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, 1100));
    }
  }

  // Report completion, as the pre-commit implementation did.
  progress(3, 3, 'Verifying Houses Data');
  return returnValue;
}

0 commit comments

Comments
 (0)