ScheduleExtractor/schedule_extractor.py at main · ScalableHCI/ScheduleExtractor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

def fetch_and_clean_html(url, timeout=30):
    """Fetch a page, tidy its first ``<table>``, and return embeddable HTML.

    The first table found in the response is modified in place:

    * its first and last rows are removed, then the first cell of every
      remaining row is removed;
    * existing inline ``style`` attributes on cells are dropped;
    * the ``<td>`` cells of the new first row become ``<th>`` cells
      (cells mentioning Saturday/Sunday get a normal font weight);
    * every non-empty ``<td>`` gets a light grey background;
    * links of the form ``https://www.google.com/url?q=<REAL_URL>`` are
      rewritten to point straight at ``<REAL_URL>``.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        A string containing a ``<div class="embedded-table">`` that wraps a
        ``<style>`` block and the cleaned table, or ``None`` when the request
        fails or a ``ValueError`` is raised (e.g. no table in the page). In
        the failure case the error is printed rather than raised.
    """
    try:
        # Fetch the HTML content from the URL. Without a timeout requests
        # waits indefinitely on an unresponsive server; a timeout surfaces as
        # requests.Timeout, which the RequestException handler below reports.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the main table (the first <table> in the document)
        table = soup.find('table')
        if not table:
            raise ValueError("Table not found in the provided URL.")

        # Remove the top and bottom rows (headers, footers, etc.).
        # `rows` is a snapshot taken before any removal, so rows[-1] is still
        # the original last row; the length check avoids removing the same
        # single row twice.
        rows = table.find_all('tr')
        if rows:
            rows[0].extract()  # Remove first row
        if len(rows) > 1:
            rows[-1].extract()  # Remove last row

        # Remove the leftmost column which often holds row numbers
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if cells:
                cells[0].extract()

        # Remove any existing inline style attributes from cells. This must
        # happen before the styles below are applied, or it would erase them.
        for cell in table.find_all(['td', 'th']):
            if cell.has_attr('style'):
                del cell['style']

        # Make the *new* first row bold by turning its <td> cells into <th> cells
        new_rows = table.find_all('tr')
        if new_rows:  # If there's at least one row left
            first_row_cells = new_rows[0].find_all('td')
            for cell in first_row_cells:
                cell.name = 'th'  # Changing <td> to <th> makes them bold by default

                # Check for Saturday or Sunday and make them non-bold
                cell_text = cell.get_text().lower()
                if 'saturday' in cell_text or 'sunday' in cell_text:
                    cell['style'] = 'font-weight: normal;'

        # Add light grey background to all non-empty content cells (td).
        # The header cells were renamed to <th> above, so they are skipped.
        for cell in table.find_all('td'):
            if cell.get_text(strip=True):
                # Use a very light grey (lighter than typical grey headers)
                cell['style'] = 'background-color: #f9f9f9;'

        # ----------------------------------------------------------------------
        #  Clean up links of the form: https://www.google.com/url?q=<REAL_URL>
        # ----------------------------------------------------------------------
        for link in table.find_all('a'):
            href = link.get('href', '')
            if href.startswith("https://www.google.com/url?q="):
                # Extract actual URL from 'q' parameter (parse_qs also
                # percent-decodes the value)
                parsed = urlparse(href)
                qs = parse_qs(parsed.query)
                real_url = qs.get('q', [''])[0]  # get first item or empty string
                if real_url:
                    # Replace the href with the real URL
                    link['href'] = real_url

        # Create a style block to enforce uniform table cells
        # and also enlarge <th> cells
        style_block = """
        <style>
            table {
                border: 1px solid #000;
                border-collapse: collapse;
                table-layout: fixed; /* Enforces your specified column widths */
            }
            th, td {
                width: 110px;       /* Adjust for most columns */
                height: 35px;
                text-align: left;
                vertical-align: top;
                overflow: hidden;
            }
            th:first-child, td:first-child {
                width: 20px;       /* smaller width for leftmost column */
            }
            /* Make all <th> cells bigger (you can tweak the values below) */
            th {
                font-size: 13px;   /* Increase the font size */
                height: 20px;
            }
            .waffle tr {
                background-color: #fff !important;
            }
        </style>
        """

        # Wrap cleaned table in a div for embedding + include the style block
        embedded_html = f"""
        <div class="embedded-table" style="font-size: 10px;">
            {style_block}
            {str(table)}
        </div>
        """

        # Return the final HTML code to be embedded
        # Note: We do NOT use prettify() here because it inserts whitespace/newlines
        # inside inline tags (like <span> or <a>), which browsers render as extra spaces,
        # breaking the layout for underlined text.
        return embedded_html

    except requests.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
    except ValueError as e:
        print(e)

    # Reached only after one of the handlers above reported a failure.
    return None

# Address of the Google Sheets "pubhtml" page whose table is turned into an
# embeddable snippet (presumably `gid` picks the sheet tab — confirm).
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTnx18GETNacxTfJJyL8tU5w8_6zeiCmxoT0vcHrAgjbd6FEzTfmnORq_pl_f4-wUMWVawYa09IO_Et/pubhtml/sheet?headers=false&gid=85355365"

# Download the page and build the cleaned HTML code to embed.
# NOTE: this is a plain module-level statement, so the network request is
# made whenever this file is executed or imported.
html_code = fetch_and_clean_html(url)

# Print the result. fetch_and_clean_html yields None after printing an error,
# so nothing further is printed in that case.
if html_code:
    print(html_code)