From 18321777f66f01187bbdaee8cbad0c9cb9530d7e Mon Sep 17 00:00:00 2001 From: Michael Ball Date: Sat, 23 May 2026 06:14:12 +0000 Subject: [PATCH 1/6] feat: implement multi-layer bot mitigation in nginx config - Update robots.txt to disallow /project/ and /api/ for compliant crawlers. - Add a user-agent blocklist to return 403 for aggressive SEO bots. - Implement rate limiting (60r/s) specifically for the /project/:id XML endpoint to reduce egress costs from automated scraping. - Serve robots.txt as a static file directly from the repository. Co-authored-by: Claude Code --- html/robots.txt | 11 +++++---- nginx.conf | 1 + nginx.conf.d/locations.conf | 17 ++++++++++++++ nginx.conf.d/snap-bot-mitigation-server.conf | 18 +++++++++++++++ nginx.conf.d/snap-bot-mitigation.conf | 24 ++++++++++++++++++++ 5 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 nginx.conf.d/snap-bot-mitigation-server.conf create mode 100644 nginx.conf.d/snap-bot-mitigation.conf diff --git a/html/robots.txt b/html/robots.txt index 52f6d45a..56d84069 100644 --- a/html/robots.txt +++ b/html/robots.txt @@ -1,6 +1,7 @@ -User-agent: SiteimproveBot -Disallow: /versions -Disallow: /old_site - User-agent: * -Allow: / +Allow: /project/*/users/* +Disallow: /project/ +Disallow: /api/ +Crawl-delay: 10 + +Sitemap: https://snap.berkeley.edu/sitemap.xml diff --git a/nginx.conf b/nginx.conf index 2b7f4c16..3dfe67ac 100644 --- a/nginx.conf +++ b/nginx.conf @@ -46,6 +46,7 @@ http { include nginx.conf.d/logging.conf; include nginx.conf.d/mime.types; + include nginx.conf.d/snap-bot-mitigation.conf; resolver ${{DNS_RESOLVER}}; diff --git a/nginx.conf.d/locations.conf b/nginx.conf.d/locations.conf index 2b2998c6..468349f1 100644 --- a/nginx.conf.d/locations.conf +++ b/nginx.conf.d/locations.conf @@ -1,6 +1,10 @@ # Shared location configurations for all environments # These are included in both the main SSL server block (prod) and non-SSL server block (dev). +# Bot mitigation (robots.txt + UA blocklist). 
Included here so it is applied +# to every server block — both prod hosts, both staging hosts, and dev. +include nginx.conf.d/snap-bot-mitigation-server.conf; + # Specify the cloud domain each page should use. set_by_lua_block $cloud_url { return os.getenv('CLOUD_URL') } @@ -48,6 +52,19 @@ location @lapisapp { } } +# Raw project XML at /project/<id> — Lapis-served and the largest +# crawler egress source, so it gets rate-limited via the snap_project zone +# (defined in snap-bot-mitigation.conf). The upcoming HTML route +# /project/<id>/users/<username> does not match this regex and is unaffected. +location ~ ^/project/[0-9]+/?$ { + limit_req zone=snap_project burst=120 nodelay; + access_log logs/lapis_access.log main_ext if=$should_log; + default_type text/html; + content_by_lua_block { + require("lapis").serve("app") + } +} + # Static content from snapCloud # Avoid unnecessarily logging data for CSS, JS, etc. location /static/ { diff --git a/nginx.conf.d/snap-bot-mitigation-server.conf b/nginx.conf.d/snap-bot-mitigation-server.conf new file mode 100644 index 00000000..163d475a --- /dev/null +++ b/nginx.conf.d/snap-bot-mitigation-server.conf @@ -0,0 +1,18 @@ +# Bot-mitigation directives that live in server{} context. Included from +# locations.conf so every server block (prod + staging, both hosts, plus +# dev) picks them up. The http-context map and limit_req_zone live in +# nginx.conf.d/snap-bot-mitigation.conf. + +# Serve robots.txt as a static file from the repo. Host-independent — does +# not depend on the per-host static root or the Lapis app. +location = /robots.txt { + access_log off; + default_type text/plain; + root html; +} + +# 403 any user-agent flagged by $snap_block_ua. `return` is one of the few +# directives that is safe to use inside `if`. 
+if ($snap_block_ua) { + return 403; +} diff --git a/nginx.conf.d/snap-bot-mitigation.conf b/nginx.conf.d/snap-bot-mitigation.conf new file mode 100644 index 00000000..51329a0d --- /dev/null +++ b/nginx.conf.d/snap-bot-mitigation.conf @@ -0,0 +1,24 @@ +# Bot-mitigation directives that must live in the http{} context. The +# matching server-context directives are in snap-bot-mitigation-server.conf, +# included from locations.conf. + +# UA blocklist for SEO crawlers that ignore robots.txt. \b word-boundaries +# guard against substring false positives (e.g. "yeti" inside a longer UA). +map $http_user_agent $snap_block_ua { + default 0; + "~*ahrefsbot" 1; + "~*semrushbot" 1; + "~*\bdotbot\b" 1; + "~*mj12bot" 1; + "~*sleepbot" 1; + "~*\byeti\b" 1; + "~*blexbot" 1; + "~*petalbot" 1; +} + +# Rate-limit zone for the raw XML at /project/<id>. The rate is +# deliberately generous because Snap! is used in classrooms where 20–40 +# students share one NAT IP; tighten only with evidence from error.log +# ("limiting requests" lines). 
+limit_req_zone $binary_remote_addr zone=snap_project:10m rate=60r/s; +limit_req_status 429; From bfa07ef64a4c9692527066a7952e6a374fd5574f Mon Sep 17 00:00:00 2001 From: Michael Ball Date: Sat, 23 May 2026 02:53:53 -0700 Subject: [PATCH 2/6] no sitemap --- html/robots.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/html/robots.txt b/html/robots.txt index 56d84069..76ae5bf4 100644 --- a/html/robots.txt +++ b/html/robots.txt @@ -3,5 +3,3 @@ Allow: /project/*/users/* Disallow: /project/ Disallow: /api/ Crawl-delay: 10 - -Sitemap: https://snap.berkeley.edu/sitemap.xml From 7da3d5e2bca3e0dfbad9fbd0f1faea6e8cf52fd5 Mon Sep 17 00:00:00 2001 From: Michael Ball Date: Sat, 23 May 2026 02:58:33 -0700 Subject: [PATCH 3/6] stricter robots.txt --- html/robots.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/html/robots.txt b/html/robots.txt index 76ae5bf4..e92e503a 100644 --- a/html/robots.txt +++ b/html/robots.txt @@ -1,5 +1,16 @@ +# General bots +# Note: Google does not respect Crawl-delay, but that's fine. User-agent: * Allow: /project/*/users/* Disallow: /project/ Disallow: /api/ Crawl-delay: 10 + +# AI crawlers — slower, same path rules duplicated +User-agent: GPTBot +User-agent: ClaudeBot +User-agent: PerplexityBot +Allow: /project/*/users/* +Disallow: /project/ +Disallow: /api/ +Crawl-delay: 30 From 180756f12600ad1db41013208af7bd1889f63db0 Mon Sep 17 00:00:00 2001 From: Michael Ball Date: Sat, 23 May 2026 10:03:49 +0000 Subject: [PATCH 4/6] refactor: canonicalize project API routes to use /api/v1 prefix Update internal UI and model call sites to use the explicit /api/v1/project/:id path instead of the bare /project/:id route. This ensures legitimate application traffic bypasses bot-mitigation rate limits targeting legacy crawler entry points. 
Co-authored-by: Claude Code --- models/projects.lua | 2 +- views/partials/flag_list.etlua | 4 ++-- views/partials/project_buttons.etlua | 12 ++++++------ views/partials/project_details.etlua | 12 ++++++------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/models/projects.lua b/models/projects.lua index 7f9aa476..986d469d 100644 --- a/models/projects.lua +++ b/models/projects.lua @@ -117,7 +117,7 @@ local ActiveProjects = Model:extend('active_projects', { '#present:Username=' .. escape(self.username) .. '&ProjectName=' .. escape(self.projectname) .. '&editMode&noRun', - download = '/project/' .. escape(self.id), + download = '/api/v1/project/' .. escape(self.id), site = '/project?username=' .. escape(self.username) .. '&projectname=' .. escape(self.projectname), author = '/user?username=' .. escape(self.username), diff --git a/views/partials/flag_list.etlua b/views/partials/flag_list.etlua index 86b9924c..a5c1083a 100644 --- a/views/partials/flag_list.etlua +++ b/views/partials/flag_list.etlua @@ -43,7 +43,7 @@ 'Are you sure you want to remove this flag?', () => { cloud.delete( - '/project/<%= project.id %>/flag', + '/api/v1/project/<%= project.id %>/flag', null, { flagger: flagger } ); @@ -56,7 +56,7 @@ 'report the flagger for abusing the flagging system?', () => { cloud.delete( - '/project/<%= project.id %>/flag', + '/api/v1/project/<%= project.id %>/flag', null, { flagger: flagger, report: true } ); diff --git a/views/partials/project_buttons.etlua b/views/partials/project_buttons.etlua index c407de74..3706e63d 100644 --- a/views/partials/project_buttons.etlua +++ b/views/partials/project_buttons.etlua @@ -10,14 +10,14 @@ aria-label="<%= locale.get('unbookmark') %>" title="<%= locale.get('unbookmark') %>" onclick="cloud.delete( - '/project/<%= project.id %>/bookmark/<%= current_user.id %>')" + '/api/v1/project/<%= project.id %>/bookmark/<%= current_user.id %>')" > <% else %> <% end %> <% end %> @@ -128,7 +128,7 @@ function confirmDelete () { '<%- 
package.loaded.dialog( 'confirm_delete', { item_name = 'project'}) %>', - () => { cloud.delete('/project/<%= project.id %>'); } + () => { cloud.delete('/api/v1/project/<%= project.id %>'); } ); } @@ -143,7 +143,7 @@ function confirmFlag () { var form = document.querySelector('form.reasons'); cloud.post( - '/project/<%= project.id %>/flag', + '/api/v1/project/<%= project.id %>/flag', null, { reason: form.querySelector( @@ -161,7 +161,7 @@ function confirmFlag () { } function unflagProject() { - cloud.delete('/project/<%= project.id %>/flag') + cloud.delete('/api/v1/project/<%= project.id %>/flag') } function markAsRemix () { @@ -170,7 +170,7 @@ function markAsRemix () { input => { var url = new URL(input); cloud.post( - '/project/<%= project.id %>/mark_as_remix', + '/api/v1/project/<%= project.id %>/mark_as_remix', null, { username: '<%= project.username %>', diff --git a/views/partials/project_details.etlua b/views/partials/project_details.etlua index a3e8e5d1..db0f3cfb 100644 --- a/views/partials/project_details.etlua +++ b/views/partials/project_details.etlua @@ -42,34 +42,34 @@ <% elseif project.ispublic then %> <% else %> <% end %> <% end %> From 02d10667410daa01f2202ffe741cb635ed2ff879 Mon Sep 17 00:00:00 2001 From: Michael Ball Date: Sun, 24 May 2026 12:45:11 -0700 Subject: [PATCH 5/6] Set a filename --- views/partials/project_buttons.etlua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/views/partials/project_buttons.etlua b/views/partials/project_buttons.etlua index 3706e63d..ae5eb508 100644 --- a/views/partials/project_buttons.etlua +++ b/views/partials/project_buttons.etlua @@ -28,7 +28,8 @@ %> <%= locale.get('download') %> From 5f1b9a2e48b2c7664086abeb9d5a4a8f1897ea62 Mon Sep 17 00:00:00 2001 From: Michael Ball Date: Sun, 24 May 2026 13:41:40 -0700 Subject: [PATCH 6/6] Fix openSAP course links --- views/static/resources.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/views/static/resources.lua 
b/views/static/resources.lua index 17fe4620..f515b624 100644 --- a/views/static/resources.lua +++ b/views/static/resources.lua @@ -40,7 +40,7 @@ local materials = { { title = "Get Coding with Snap!", author = 'openSAP', - url = "https://open.sap.com/courses/snap1", + url = "https://learning.sap.com/learning-journeys/get-coding-with-snap-building-up-to-ai", language = {"English"}, type = "course", level = 'beginner', @@ -51,7 +51,7 @@ local materials = { { title = "From Media Computation to Data Science", author = 'openSAP', - url = "https://open.sap.com/courses/snap2", + url = "https://learning.sap.com/courses/from-media-computation-to-data-science", language = {"English"}, type = "course", level = nil, @@ -70,7 +70,7 @@ local materials = { description = nil, image = nil }, - { + { title = "BJC Sparks", author = 'UC Berkeley and EDC', url = "https://bjc.berkeley.edu/sparks/",