-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
126 lines (99 loc) · 3.64 KB
/
main.py
File metadata and controls
126 lines (99 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import pandas as pd
from datetime import datetime
import time
import knime.scripting.io as knio
# =============================================================================
# CONFIG
# =============================================================================
# Credentials for the automatic sign-in. Both are optional: when either one is
# left empty, login_linkedin() does not touch the login form.
LINKEDIN_EMAIL = ""
LINKEDIN_PASSWORD = ""

# Analytics page that is opened after login and scraped for follower demographics.
# NOTE(review): the trailing "profile" segment of the URN looks like a
# placeholder for a real profile id - confirm before running.
ANALYTICS_URL = (
    "https://www.linkedin.com/analytics/demographic-detail/"
    "urn:li:fsd_profile:profile/"
    "?metricType=MEMBER_FOLLOWERS"
)
# =============================================================================
# FUNCTIONS
# =============================================================================
def login_linkedin(page, email=None, password=None):
    """Open the LinkedIn login page and, when credentials are given, submit them.

    Args:
        page: Playwright page (sync API) used to navigate and fill the form.
        email: Account e-mail. Automatic login is skipped when empty or None.
        password: Account password. Automatic login is skipped when empty or None.

    Returns:
        None. A failure while filling or submitting the form is reported on
        stdout and otherwise ignored, so the caller can continue and the login
        can be completed manually in the browser window if needed.
    """
    page.goto("https://www.linkedin.com/login", wait_until="domcontentloaded")
    time.sleep(2)

    if not (email and password):
        # No credentials supplied: manual login if needed.
        return

    try:
        page.fill("#username", email)
        page.fill("#password", password)
        page.click('button[type="submit"]')
    except Exception as exc:
        # Best effort only. Catch Exception rather than using a bare ``except``
        # so that KeyboardInterrupt / SystemExit still stop the script.
        print(f"Automatic LinkedIn login failed ({exc!r}); manual login may be needed.")
    else:
        # Give the post-login redirect time to finish.
        time.sleep(5)
# Section titles that start a new demographic group in the page text.
# NOTE(review): these are Spanish labels; presumably they match what LinkedIn
# renders for the "es-ES" locale - confirm against the live page.
_SECTION_TITLES = frozenset({
    "Cargo", "Ubicación", "Sector",
    "Nivel de responsabilidad", "Antigüedad",
    "Tamaño de empresa", "Función",
})


def _parse_demographic_text(body_text, extracted_at):
    """Convert the page's visible text into one record per percentage entry."""
    records = []
    section = None
    label = None  # most recent non-blank, non-percentage line not yet paired

    for raw_line in body_text.split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank lines must not break the pairing of a label with the
            # percentage that follows it.
            continue

        if line in _SECTION_TITLES:
            section = line
            label = None  # a section title is never an entry name
            continue

        if "%" not in line:
            label = line
            continue

        if section is None:
            # Percentages that appear before the first known section are ignored.
            continue

        if label is not None:
            # Two-line layout: the name is on the previous text line.
            name, percentage = label, line
            label = None
        else:
            # Single-line layout: "<name> <percentage>".
            name, _, percentage = line.rpartition(" ")
            name = name.strip()

        if name and percentage:
            records.append({
                "section": section,
                "name": name,
                "percentage": percentage,
                "extraction_date": extracted_at,
            })

    return records


def extract_simple_data(page, pause_scale=1.0):
    """Scroll the analytics page and parse its visible text into records.

    Args:
        page: Playwright page (sync API); only ``evaluate(script)`` and
            ``locator("body").inner_text()`` are used.
        pause_scale: Non-negative multiplier applied to the built-in waits
            (3 s before scrolling, 0.5 s per scroll step, 2 s afterwards).
            The default of 1.0 keeps the original timing.

    Returns:
        A list of dicts with the keys ``section``, ``name``, ``percentage``
        and ``extraction_date``. All records of one call share the same
        ``extraction_date``. The list is empty when nothing was recognised.
    """
    time.sleep(3 * pause_scale)

    # Smooth scroll down in steps, then jump back to the top, before the
    # text is read.
    for step in range(5):
        page.evaluate(f"window.scrollTo(0, {step * 300})")
        time.sleep(0.5 * pause_scale)
    page.evaluate("window.scrollTo(0, 0)")
    time.sleep(2 * pause_scale)

    body_text = page.locator("body").inner_text()
    # One timestamp per extraction, so every row of a run carries the same value.
    return _parse_demographic_text(body_text, datetime.now())
# =============================================================================
# MAIN
# =============================================================================
# Drive a visible Chromium session: sign in, open the analytics page and
# collect the rows. ``rows`` stays an empty list when nothing is extracted.
rows = []
with sync_playwright() as p:
    browser = p.chromium.launch(
        # A visible window, so the login can be completed by hand if needed.
        headless=False,
        slow_mo=500,
        # NOTE(review): presumably meant to make the session look less like
        # an automated browser - confirm it is still required.
        args=["--disable-blink-features=AutomationControlled"],
    )
    try:
        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            locale="es-ES",
        )
        page = context.new_page()
        page.set_default_timeout(60000)

        login_linkedin(page, LINKEDIN_EMAIL, LINKEDIN_PASSWORD)

        try:
            page.goto(ANALYTICS_URL, wait_until="domcontentloaded", timeout=30000)
        except PlaywrightTimeout:
            # Best effort: keep going with whatever has loaded so far, but
            # say so instead of failing silently.
            print("Timed out opening the analytics page; extracting from the current page state.")

        time.sleep(5)
        rows = extract_simple_data(page)
    finally:
        # Close the browser even when login or extraction raises.
        browser.close()
# =============================================================================
# KNIME OUTPUT
# =============================================================================
# Build the output frame from the extracted rows. With no rows, fall back to
# an empty frame that still carries the four expected column names.
df = (
    pd.DataFrame(rows)
    if rows
    else pd.DataFrame(columns=["section", "name", "percentage", "extraction_date"])
)

# Hand the frame to the first output port of the KNIME node.
knio.output_tables[0] = knio.Table.from_pandas(df)