Skip to content

Commit 44eafb8

Browse files
authored
Merge pull request #246 from stickerdaniel/master
Fixed handling of the case where a person has multiple positions under one company
2 parents 56305c6 + 23ee0ec commit 44eafb8

File tree

1 file changed

+131
-74
lines changed

1 file changed

+131
-74
lines changed

linkedin_scraper/person.py

Lines changed: 131 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -116,17 +116,31 @@ def get_experiences(self):
116116
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
117117
for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
118118
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
119-
company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
119+
120+
# Fix: Handle case where more than 2 elements are returned
121+
elements = position.find_elements(By.XPATH, "*")
122+
if len(elements) < 2:
123+
continue # Skip if we don't have enough elements
124+
125+
company_logo_elem = elements[0]
126+
position_details = elements[1]
120127

121128
# company elem
122-
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
123-
if not company_linkedin_url:
129+
try:
130+
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
131+
if not company_linkedin_url:
132+
continue
133+
except NoSuchElementException:
124134
continue
125135

126136
# position details
127137
position_details_list = position_details.find_elements(By.XPATH,"*")
128138
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
129139
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
140+
141+
if not position_summary_details:
142+
continue
143+
130144
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
131145

132146
if len(outer_positions) == 4:
@@ -147,50 +161,71 @@ def get_experiences(self):
147161
location = outer_positions[2].find_element(By.TAG_NAME,"span").text
148162
else:
149163
position_title = ""
150-
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
151-
work_times = ""
164+
company = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else ""
165+
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else ""
152166
location = ""
153167

154-
155-
times = work_times.split("·")[0].strip() if work_times else ""
156-
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
168+
# Safely extract times and duration
169+
if work_times:
170+
parts = work_times.split("·")
171+
times = parts[0].strip() if parts else ""
172+
duration = parts[1].strip() if len(parts) > 1 else None
173+
else:
174+
times = ""
175+
duration = None
157176

158177
from_date = " ".join(times.split(" ")[:2]) if times else ""
159-
to_date = " ".join(times.split(" ")[3:]) if times else ""
160-
if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
161-
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
162-
.find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
163-
.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
178+
to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else ""
179+
180+
if position_summary_text and any(element.get_attribute("class") == "pvs-list__container" for element in position_summary_text.find_elements(By.XPATH, "*")):
181+
try:
182+
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
183+
.find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
184+
.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
185+
except NoSuchElementException:
186+
inner_positions = []
164187
else:
165188
inner_positions = []
189+
166190
if len(inner_positions) > 1:
167191
descriptions = inner_positions
168192
for description in descriptions:
169-
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
170-
position_title_elem = res[0] if len(res) > 0 else None
171-
work_times_elem = res[1] if len(res) > 1 else None
172-
location_elem = res[2] if len(res) > 2 else None
173-
174-
175-
location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
176-
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
177-
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""
178-
times = work_times.split("·")[0].strip() if work_times else ""
179-
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
180-
from_date = " ".join(times.split(" ")[:2]) if times else ""
181-
to_date = " ".join(times.split(" ")[3:]) if times else ""
182-
183-
experience = Experience(
184-
position_title=position_title,
185-
from_date=from_date,
186-
to_date=to_date,
187-
duration=duration,
188-
location=location,
189-
description=description,
190-
institution_name=company,
191-
linkedin_url=company_linkedin_url
192-
)
193-
self.add_experience(experience)
193+
try:
194+
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
195+
position_title_elem = res[0] if len(res) > 0 else None
196+
work_times_elem = res[1] if len(res) > 1 else None
197+
location_elem = res[2] if len(res) > 2 else None
198+
199+
location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
200+
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
201+
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""
202+
203+
# Safely extract times and duration
204+
if work_times:
205+
parts = work_times.split("·")
206+
times = parts[0].strip() if parts else ""
207+
duration = parts[1].strip() if len(parts) > 1 else None
208+
else:
209+
times = ""
210+
duration = None
211+
212+
from_date = " ".join(times.split(" ")[:2]) if times else ""
213+
to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else ""
214+
215+
experience = Experience(
216+
position_title=position_title,
217+
from_date=from_date,
218+
to_date=to_date,
219+
duration=duration,
220+
location=location,
221+
description=description,
222+
institution_name=company,
223+
linkedin_url=company_linkedin_url
224+
)
225+
self.add_experience(experience)
226+
except (NoSuchElementException, IndexError) as e:
227+
# Skip this description if elements are missing
228+
continue
194229
else:
195230
description = position_summary_text.text if position_summary_text else ""
196231

@@ -215,47 +250,69 @@ def get_educations(self):
215250
self.scroll_to_bottom()
216251
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
217252
for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
218-
position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
219-
institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")
220-
221-
# company elem
222-
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
223-
224-
# position details
225-
position_details_list = position_details.find_elements(By.XPATH,"*")
226-
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
227-
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
228-
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
229-
230-
institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
231-
if len(outer_positions) > 1:
232-
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
233-
else:
234-
degree = None
235-
236-
if len(outer_positions) > 2:
237-
times = outer_positions[2].find_element(By.TAG_NAME,"span").text
253+
try:
254+
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
255+
256+
# Fix: Handle case where more than 2 elements are returned
257+
elements = position.find_elements(By.XPATH,"*")
258+
if len(elements) < 2:
259+
continue # Skip if we don't have enough elements
260+
261+
institution_logo_elem = elements[0]
262+
position_details = elements[1]
263+
264+
# institution elem
265+
try:
266+
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
267+
except NoSuchElementException:
268+
institution_linkedin_url = None
269+
270+
# position details
271+
position_details_list = position_details.find_elements(By.XPATH,"*")
272+
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
273+
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
274+
275+
if not position_summary_details:
276+
continue
277+
278+
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
279+
280+
institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else ""
281+
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else None
238282

239-
if times != "":
240-
from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
241-
to_date = times.split(" ")[-1]
242-
else:
243283
from_date = None
244284
to_date = None
285+
286+
if len(outer_positions) > 2:
287+
try:
288+
times = outer_positions[2].find_element(By.TAG_NAME,"span").text
289+
290+
if times and "-" in times:
291+
split_times = times.split(" ")
292+
dash_index = split_times.index("-") if "-" in split_times else -1
293+
294+
if dash_index > 0:
295+
from_date = split_times[dash_index-1]
296+
if dash_index < len(split_times) - 1:
297+
to_date = split_times[-1]
298+
except (NoSuchElementException, ValueError):
299+
from_date = None
300+
to_date = None
245301

302+
description = position_summary_text.text if position_summary_text else ""
246303

247-
248-
description = position_summary_text.text if position_summary_text else ""
249-
250-
education = Education(
251-
from_date=from_date,
252-
to_date=to_date,
253-
description=description,
254-
degree=degree,
255-
institution_name=institution_name,
256-
linkedin_url=institution_linkedin_url
257-
)
258-
self.add_education(education)
304+
education = Education(
305+
from_date=from_date,
306+
to_date=to_date,
307+
description=description,
308+
degree=degree,
309+
institution_name=institution_name,
310+
linkedin_url=institution_linkedin_url
311+
)
312+
self.add_education(education)
313+
except (NoSuchElementException, IndexError) as e:
314+
# Skip this education entry if elements are missing
315+
continue
259316

260317
def get_name_and_location(self):
261318
top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")

0 commit comments

Comments
 (0)