diff --git a/html/robots.txt b/html/robots.txt index 52f6d45a..e92e503a 100644 --- a/html/robots.txt +++ b/html/robots.txt @@ -1,6 +1,16 @@ -User-agent: SiteimproveBot -Disallow: /versions -Disallow: /old_site - +# General bots +# Note: Google does not respect Crawl-delay, but that's fine. User-agent: * -Allow: / +Allow: /project/*/users/* +Disallow: /project/ +Disallow: /api/ +Crawl-delay: 10 + +# AI crawlers — slower, same path rules duplicated +User-agent: GPTBot +User-agent: ClaudeBot +User-agent: PerplexityBot +Allow: /project/*/users/* +Disallow: /project/ +Disallow: /api/ +Crawl-delay: 30 diff --git a/models/projects.lua b/models/projects.lua index 7f9aa476..986d469d 100644 --- a/models/projects.lua +++ b/models/projects.lua @@ -117,7 +117,7 @@ local ActiveProjects = Model:extend('active_projects', { '#present:Username=' .. escape(self.username) .. '&ProjectName=' .. escape(self.projectname) .. '&editMode&noRun', - download = '/project/' .. escape(self.id), + download = '/api/v1/project/' .. escape(self.id), site = '/project?username=' .. escape(self.username) .. '&projectname=' .. escape(self.projectname), author = '/user?username=' .. escape(self.username), diff --git a/nginx.conf b/nginx.conf index 1b6f60a4..d751fdb1 100644 --- a/nginx.conf +++ b/nginx.conf @@ -46,6 +46,7 @@ http { include nginx.conf.d/logging.conf; include nginx.conf.d/mime.types; + include nginx.conf.d/snap-bot-mitigation.conf; resolver ${{DNS_RESOLVER}}; diff --git a/nginx.conf.d/locations.conf b/nginx.conf.d/locations.conf index 2b2998c6..468349f1 100644 --- a/nginx.conf.d/locations.conf +++ b/nginx.conf.d/locations.conf @@ -1,6 +1,10 @@ # Shared location configurations for all environments # These are included in both the main SSL server block (prod) and non-SSL server block (dev). +# Bot mitigation (robots.txt + UA blocklist). Included here so it is applied +# to every server block — both prod hosts, both staging hosts, and dev. 
+include nginx.conf.d/snap-bot-mitigation-server.conf; + # Specify the cloud domain each page should use. set_by_lua_block $cloud_url { return os.getenv('CLOUD_URL') } @@ -48,6 +52,19 @@ location @lapisapp { } } +# Raw project XML at /project/:id — Lapis-served and the largest +# crawler egress source, so it gets rate-limited via the snap_project zone +# (defined in snap-bot-mitigation.conf). The upcoming HTML route +# /project/:id/users/:username does not match this regex and is unaffected. +location ~ ^/project/[0-9]+/?$ { + limit_req zone=snap_project burst=120 nodelay; + access_log logs/lapis_access.log main_ext if=$should_log; + default_type text/html; + content_by_lua_block { + require("lapis").serve("app") + } +} + # Static content from snapCloud # Avoid unnecessarily logging data for CSS, JS, etc. location /static/ { diff --git a/nginx.conf.d/snap-bot-mitigation-server.conf b/nginx.conf.d/snap-bot-mitigation-server.conf new file mode 100644 index 00000000..163d475a --- /dev/null +++ b/nginx.conf.d/snap-bot-mitigation-server.conf @@ -0,0 +1,18 @@ +# Bot-mitigation directives that live in server{} context. Included from +# locations.conf so every server block (prod + staging, both hosts, plus +# dev) picks them up. The http-context map and limit_req_zone live in +# nginx.conf.d/snap-bot-mitigation.conf. + +# Serve robots.txt as a static file from the repo. Host-independent — does +# not depend on the per-host static root or the Lapis app. +location = /robots.txt { + access_log off; + default_type text/plain; + root html; +} + +# 403 any user-agent flagged by $snap_block_ua. `return` is one of the few +# directives that is safe to use inside `if`. +if ($snap_block_ua) { + return 403; +} diff --git a/nginx.conf.d/snap-bot-mitigation.conf b/nginx.conf.d/snap-bot-mitigation.conf new file mode 100644 index 00000000..51329a0d --- /dev/null +++ b/nginx.conf.d/snap-bot-mitigation.conf @@ -0,0 +1,24 @@ +# Bot-mitigation directives that must live in the http{} context. 
The +# matching server-context directives are in snap-bot-mitigation-server.conf, +# included from locations.conf. + +# UA blocklist for SEO crawlers that ignore robots.txt. \b word-boundaries +# guard against substring false positives (e.g. "yeti" inside a longer word). +map $http_user_agent $snap_block_ua { + default 0; + "~*ahrefsbot" 1; + "~*semrushbot" 1; + "~*\bdotbot\b" 1; + "~*mj12bot" 1; + "~*sleepbot" 1; + "~*\byeti\b" 1; + "~*blexbot" 1; + "~*petalbot" 1; +} + +# Rate-limit zone for the raw XML at /project/:id. The rate is +# deliberately generous because Snap! is used in classrooms where 20–40 +# students share one NAT IP; tighten only with evidence from error.log +# ("limiting requests" lines). +limit_req_zone $binary_remote_addr zone=snap_project:10m rate=60r/s; +limit_req_status 429; diff --git a/views/partials/flag_list.etlua b/views/partials/flag_list.etlua index 86b9924c..a5c1083a 100644 --- a/views/partials/flag_list.etlua +++ b/views/partials/flag_list.etlua @@ -43,7 +43,7 @@ 'Are you sure you want to remove this flag?', () => { cloud.delete( - '/project/<%= project.id %>/flag', + '/api/v1/project/<%= project.id %>/flag', null, { flagger: flagger } ); @@ -56,7 +56,7 @@ 'report the flagger for abusing the flagging system?', () => { cloud.delete( - '/project/<%= project.id %>/flag', + '/api/v1/project/<%= project.id %>/flag', null, { flagger: flagger, report: true } ); diff --git a/views/partials/project_buttons.etlua b/views/partials/project_buttons.etlua index c407de74..ae5eb508 100644 --- a/views/partials/project_buttons.etlua +++ b/views/partials/project_buttons.etlua @@ -10,14 +10,14 @@ aria-label="<%= locale.get('unbookmark') %>" title="<%= locale.get('unbookmark') %>" onclick="cloud.delete( - '/project/<%= project.id %>/bookmark/<%= current_user.id %>')" + '/api/v1/project/<%= project.id %>/bookmark/<%= current_user.id %>')" > <% else %> <% end %> <% end %> @@ -28,7 +28,8 @@ %> <%= locale.get('download') %> @@ -128,7 +129,7 @@ function 
confirmDelete () { '<%- package.loaded.dialog( 'confirm_delete', { item_name = 'project'}) %>', - () => { cloud.delete('/project/<%= project.id %>'); } + () => { cloud.delete('/api/v1/project/<%= project.id %>'); } ); } @@ -143,7 +144,7 @@ function confirmFlag () { var form = document.querySelector('form.reasons'); cloud.post( - '/project/<%= project.id %>/flag', + '/api/v1/project/<%= project.id %>/flag', null, { reason: form.querySelector( @@ -161,7 +162,7 @@ function confirmFlag () { } function unflagProject() { - cloud.delete('/project/<%= project.id %>/flag') + cloud.delete('/api/v1/project/<%= project.id %>/flag') } function markAsRemix () { @@ -170,7 +171,7 @@ function markAsRemix () { input => { var url = new URL(input); cloud.post( - '/project/<%= project.id %>/mark_as_remix', + '/api/v1/project/<%= project.id %>/mark_as_remix', null, { username: '<%= project.username %>', diff --git a/views/partials/project_details.etlua b/views/partials/project_details.etlua index a3e8e5d1..db0f3cfb 100644 --- a/views/partials/project_details.etlua +++ b/views/partials/project_details.etlua @@ -42,34 +42,34 @@ <% elseif project.ispublic then %> <% else %> <% end %> <% end %> diff --git a/views/static/resources.lua b/views/static/resources.lua index 17fe4620..f515b624 100644 --- a/views/static/resources.lua +++ b/views/static/resources.lua @@ -40,7 +40,7 @@ local materials = { { title = "Get Coding with Snap!", author = 'openSAP', - url = "https://open.sap.com/courses/snap1", + url = "https://learning.sap.com/learning-journeys/get-coding-with-snap-building-up-to-ai", language = {"English"}, type = "course", level = 'beginner', @@ -51,7 +51,7 @@ local materials = { { title = "From Media Computation to Data Science", author = 'openSAP', - url = "https://open.sap.com/courses/snap2", + url = "https://learning.sap.com/courses/from-media-computation-to-data-science", language = {"English"}, type = "course", level = nil, @@ -70,7 +70,7 @@ local materials = { description = nil, 
image = nil }, - { + { title = "BJC Sparks", author = 'UC Berkeley and EDC', url = "https://bjc.berkeley.edu/sparks/",