Skip to content

Commit 72619d8

Browse files
committed
Enhance web scraping functionality with error handling and fix data extraction logic
1 parent 0164f3f commit 72619d8

1 file changed

Lines changed: 27 additions & 21 deletions

File tree

episodes/a-real-website.md

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,17 @@ from tqdm import tqdm
5656

5757
# Getting the HTML from our desired URL as a text string
5858
url = 'https://carpentries.org/workshops/upcoming-workshops/'
59-
req = requests.get(url).text
59+
req = requests.get(url)
6060

61-
# Cleaning and printing the string
62-
cleaned_req = re.sub(r'\s*\n\s*', '', req).strip()
63-
print(cleaned_req[0:1000])
61+
# Checking if the request was successful
62+
if req.status_code == 200:
63+
req = req.text
64+
65+
# Cleaning and printing the string
66+
cleaned_req = re.sub(r'\s*\n\s*', '', req).strip()
67+
print(cleaned_req[0:1000])
68+
else:
69+
print(f"Failed to retrieve the webpage. Status code: {req.status_code}")
6470
```
6571

6672
```output
@@ -114,7 +120,7 @@ soup = BeautifulSoup(cleaned_req, 'html.parser')
114120
# Finding all third-level headers and doing a formatted print
115121
h3_by_tag = soup.find_all('h3')
116122
print("Number of h3 elements found: ", len(h3_by_tag))
117-
for n, h3 in enumerate(h3_by_tag):
123+
for n, h3 in enumerate(h3_by_tag, start=1):
118124
print(f"Workshop #{n} - {h3.get_text()}")
119125
```
120126

@@ -159,18 +165,18 @@ print(div_firsth3.prettify())
159165

160166
Remember, the output shown here will probably differ from yours, because the website is continuously updated.
161167
```output
162-
<div class="p-8 mb-5 border" data-country="Puerto Rico" data-curriculum="Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)" data-meeting="In Person" data-program="Software Carpentry">
168+
<div class="p-8 mb-5 border" data-country="United States" data-curriculum="Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)" data-meeting="In Person" data-program="Library Carpentry">
163169
<div class="flex mb-4 -mx-2">
164170
<div class="flex items-center mx-2">
165-
<img alt="" class="mx-1" src="/software.svg"/>
171+
<img alt="" class="mx-1" src="/library.svg"/>
166172
<span class="text-[0.625rem] uppercase">
167-
Software Carpentry
173+
Library Carpentry
168174
</span>
169175
</div>
170176
<div class="flex items-center mx-2">
171-
<img alt="" class="mr-1" height="20" src="/flags/pr.png" width="20"/>
177+
<img alt="" class="mr-1" height="20" src="/flags/us.png" width="20"/>
172178
<span class="text-[0.625rem] uppercase">
173-
Puerto Rico
179+
United States
174180
</span>
175181
</div>
176182
<div class="flex items-center mx-2">
@@ -181,20 +187,20 @@ Remember, the output shown here is probably different than yours, as the website
181187
</div>
182188
</div>
183189
<h3 class="title text-base md:text-[1.75rem] leading-[2.125rem] font-semibold">
184-
<a class="underline hover:text-blue-hover text-gray-dark" href="https://dept-ccom-uprrp.github.io/2025-06-04-uprrp-r/">
185-
University of Puerto Rico
190+
<a class="underline hover:text-blue-hover text-gray-dark" href="https://unt-carpentries.github.io/2026-01-22-unt/">
191+
University of North Texas
186192
</a>
187193
</h3>
188194
<div class="mb-5 text-lg font-semibold text-gray-mid">
189-
Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)
195+
Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)
190196
</div>
191197
<div class="mb-2 text-xs">
192198
<strong class="font-bold">
193199
Instructors
194200
</strong>
195201
:
196202
<span class="instructors">
197-
Humberto Ortiz-Zuazaga, Airined Montes Mercado
203+
Sarah Lynn Fisher, Maristella Feustle, Whitney Johnson-Freeman
198204
</span>
199205
</div>
200206
<div class="mb-4 text-xs">
@@ -203,11 +209,11 @@ Remember, the output shown here is probably different than yours, as the website
203209
</strong>
204210
:
205211
<span class="helpers">
206-
Isabel Rivera, Diana Buitrago Escobar, Yabdiel Ramos Valerio
212+
Marcia McIntosh, Trey Clark
207213
</span>
208214
</div>
209215
<div class="text-sm font-semibold text-gray-mid">
210-
Jun 04 - Jun 10 2025
216+
Jan 22 - Jan 22 2026
211217
</div>
212218
</div>
213219
```
@@ -246,11 +252,11 @@ workshop_list = []
246252
for item in divs:
247253
dict_workshop = {}
248254
dict_workshop['host'] = item.find('h3').get_text()
249-
dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href')
250-
dict_workshop['curriculum'] = div_firsth3.get('data-curriculum')
251-
dict_workshop['country'] = div_firsth3.get('data-country')
252-
dict_workshop['format'] = div_firsth3.get('data-meeting')
253-
dict_workshop['program'] = div_firsth3.get('data-program')
255+
dict_workshop['link'] = item.find('h3').find('a').get('href') # .get() reads a tag attribute's value, much like dict.get() on a dictionary
256+
dict_workshop['curriculum'] = item.get('data-curriculum')
257+
dict_workshop['country'] = item.get('data-country')
258+
dict_workshop['format'] = item.get('data-meeting')
259+
dict_workshop['program'] = item.get('data-program')
254260
workshop_list.append(dict_workshop)
255261

256262
# Transform list into a DataFrame

0 commit comments

Comments
 (0)