-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathschedule_extractor.py
More file actions
executable file
·127 lines (109 loc) · 4.84 KB
/
schedule_extractor.py
File metadata and controls
executable file
·127 lines (109 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
# Prefix of the redirect wrapper found around outbound links in the fetched page.
_GOOGLE_REDIRECT_PREFIX = "https://www.google.com/url?q="

# CSS shipped alongside the table: fixed layout, uniform cell size, a narrow
# first column and slightly larger header cells.
_TABLE_STYLE_BLOCK = """
<style>
table {
border: 1px solid #000;
border-collapse: collapse;
table-layout: fixed; /* Enforces your specified column widths */
}
th, td {
width: 110px; /* Adjust for most columns */
height: 35px;
text-align: left;
vertical-align: top;
overflow: hidden;
}
th:first-child, td:first-child {
width: 20px; /* smaller width for leftmost column */
}
/* Make all <th> cells bigger (you can tweak the values below) */
th {
font-size: 13px; /* Increase the font size */
height: 20px;
}
.waffle tr {
background-color: #fff !important;
}
</style>
"""


def _trim_table(table):
    """Remove the first row, the last row and the leftmost cell of each remaining row."""
    rows = table.find_all('tr')
    if rows:
        rows[0].extract()  # Top row (headers, etc.)
        if len(rows) > 1:
            rows[-1].extract()  # Bottom row (footers, etc.)

    # The leftmost column often holds row numbers
    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if cells:
            cells[0].extract()


def _restyle_cells(table):
    """Reset inline cell styles, promote the first row to headers and shade content cells."""
    # Drop whatever inline styles the source page put on the cells
    for cell in table.find_all(['td', 'th']):
        if cell.has_attr('style'):
            del cell['style']

    # Turn the *new* first row into <th> cells, which are bold by default
    rows = table.find_all('tr')
    if rows:
        for cell in rows[0].find_all('td'):
            cell.name = 'th'
            # Saturday and Sunday headers are deliberately kept non-bold
            label = cell.get_text().lower()
            if 'saturday' in label or 'sunday' in label:
                cell['style'] = 'font-weight: normal;'

    # Very light grey background on every remaining <td> that has text
    for cell in table.find_all('td'):
        if cell.get_text(strip=True):
            cell['style'] = 'background-color: #f9f9f9;'


def _unwrap_google_redirects(table):
    """Point links of the form https://www.google.com/url?q=<REAL_URL> at <REAL_URL>."""
    for link in table.find_all('a'):
        href = link.get('href', '')
        if not href.startswith(_GOOGLE_REDIRECT_PREFIX):
            continue
        # The real destination is the first value of the 'q' query parameter
        real_url = parse_qs(urlparse(href).query).get('q', [''])[0]
        if real_url:
            link['href'] = real_url


def fetch_and_clean_html(url, timeout=30):
    """Fetch a page, clean up its first table and return it as embeddable HTML.

    The first <table> found at *url* is trimmed (first row, last row and
    leftmost column removed), restyled, has its Google redirect links
    unwrapped, and is wrapped in a <div class="embedded-table"> together
    with a <style> block.

    Args:
        url: Address of the page containing the table.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the script indefinitely.

    Returns:
        The HTML snippet as a string, or ``None`` when the request failed or
        no table was found (a message is printed in either case).
    """
    try:
        # Fetch the HTML content from the URL
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML and locate the main table
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        if not table:
            raise ValueError("Table not found in the provided URL.")

        # Order matters: inline styles are cleared before the new ones are set
        _trim_table(table)
        _restyle_cells(table)
        _unwrap_google_redirects(table)

        # Wrap cleaned table in a div for embedding + include the style block.
        # Note: We do NOT use prettify() here because it inserts whitespace/newlines
        # inside inline tags (like <span> or <a>), which browsers render as extra
        # spaces, breaking the layout for underlined text.
        return (
            '\n<div class="embedded-table" style="font-size: 10px;">\n'
            f"{_TABLE_STYLE_BLOCK}\n"
            f"{table!s}\n"
            "</div>\n"
        )
    except requests.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
    except ValueError as e:
        print(e)
    return None
# Google Spreadsheet page whose table gets converted
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTnx18GETNacxTfJJyL8tU5w8_6zeiCmxoT0vcHrAgjbd6FEzTfmnORq_pl_f4-wUMWVawYa09IO_Et/pubhtml/sheet?headers=false&gid=85355365"

# Build the embeddable snippet from that page
html_code = fetch_and_clean_html(url)

# Emit the snippet only when one was actually produced
if html_code:
    print(html_code)