From 5f50674a3d834c306834068bb0706ceac57ace1f Mon Sep 17 00:00:00 2001 From: Vinzenz Schroeter Date: Mon, 15 Sep 2025 20:57:48 +0200 Subject: [PATCH] keeping scrapers out --- content/posts/keeping-scrapers-out.md | 225 ++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 content/posts/keeping-scrapers-out.md diff --git a/content/posts/keeping-scrapers-out.md b/content/posts/keeping-scrapers-out.md new file mode 100644 index 0000000..557afea --- /dev/null +++ b/content/posts/keeping-scrapers-out.md @@ -0,0 +1,225 @@ ++++ +date = '2025-09-15T20:00:00+02:00' +title = 'Keeping scrapers out' +tags = ['nix', 'nginx', 'anubis'] ++++ + +After reading an [LWN article on high AI scraper traffic](https://lwn.net/Articles/1008897/), I also wanted to do something to "protect" the blog. +I do not see any continuous load peaks, but I dislike the idea of giant corporations profiting off of my writing without any negotiation. + +One solution that stands out currently is [anubis](https://anubis.techaro.lol/). +It tries to challenge requests in a way that is hard or resource intensive for scrapers at scale, but easy and resource-light for normal users. + +I could not find a good summary on how to proxy an nginx virtual host through anubis, so here we go. + + + +### Starting configuration + +The following examples are based on the configuration of this blog. +Most things not relevant here are removed, but you can take a look at the [original](https://git.berlin.ccc.de/vinzenz/nixos-configuration/src/commit/5f5c7267dc8c734eca2de87b5c0168523c9fa3b3/hosts/hetzner-vpn2/nginx.nix). + +```nix +{ pkgs, ... 
}: +{ + services.nginx = { + enable = true; + recommendedProxySettings = true; + recommendedTlsSettings = true; + recommendedGzipSettings = true; + recommendedOptimisation = true; + + virtualHosts = { + "zerforschen.plus" = { + addSSL = true; + enableACME = true; + root = pkgs.zerforschen-plus-content; + }; + }; + }; +} +``` + +### Hide real host + +The first thing to do is hiding the real website. +Otherwise, a scraper can just use whatever we rename it to. + +I wanted to forward everything through unix domain socket. +The kind I'll use behave like a file, which means we do not have to make sure that e.g. the application only listens on localhost. +It also makes permission management trivial, but we'll see that later. + +```nix +{ pkgs, ... }: +let + # /run/nginx already exists + blog-domain-socket = "/run/nginx/blog.sock"; +in +{ + services.nginx = { + enable = true; + virtualHosts = { + "zerforschen.plus" = { + addSSL = true; + enableACME = true; + # we do not have anything hosted here anymore + }; + # a new virtual host contains the site now + "blog-in-anubis" = { + root = pkgs.zerforschen-plus-content; + # specifying any listen overrides the defaults + listen = [ + { + # this makes nginx create the unix domain socket + addr = "unix:" + blog-domain-socket; + } + ]; + }; + }; + }; +} +``` + +### Create anubis instance + +Next, we need an anubis service that forwards to the socket created by nginx. + +```nix +{ pkgs, ... }: +let + blog-domain-socket = "/run/nginx/blog.sock"; +in +{ + services = { + # previous nginx config + + anubis.instances.main = { + enable = true; + settings = { + TARGET = "unix://" + blog-domain-socket; + }; + }; + }; +} +``` + +### Forward public host to anubis + +Now we have an anubis instance running, but no way to access it from outside. + +I did not want to expose anubis to the public internet directly, so I configured another unix domain socket for this purpose. + +```nix +{ pkgs, ... 
}: +let + blog-domain-socket = "/run/nginx/blog.sock"; + anubis-domain-socket = "/run/anubis/anubis-blog.sock"; +in +{ + services = { + nginx = { + enable = true; + virtualHosts = { + "zerforschen.plus" = { + addSSL = true; + enableACME = true; + locations."/" = { + # HTTP over unix domain socket + proxyPass = "http://unix:" + anubis-domain-socket; + }; + }; + # "blog-in-anubis" = { ... }; + }; + }; + + anubis.instances.main = { + enable = true; + settings = { + # this makes anubis create and listen to the socket + BIND = anubis-domain-socket; + TARGET = "unix://" + blog-domain-socket; + }; + }; + }; +} + +``` + +### Socket permissions + +The config is nearly complete now. +Applying the configuration now produces permission errors when trying to access the site, because the sockets are owned by the service user and group with 660 permissions. +In my case, I simply added the services to each other's groups. +If this were a server I was making my living with, I'd probably create a new group and folder and add both services to that one. + +```nix +{ pkgs, ... }: +{ + systemd.services = { + nginx.serviceConfig.SupplementaryGroups = [ "anubis" ]; + anubis-main.serviceConfig.SupplementaryGroups = [ "nginx" ]; + }; +} +``` + +### Final configuration + +Overall, I think NixOS makes the required configuration easy to read and write. +The [NixOS options search](https://search.nixos.org/options) is a great source of knowledge for any software configurable through the NixOS configuration. +If the [default policy](https://github.com/TecharoHQ/anubis/blob/f745d37d9006c3431bf3d435c61565250ab53a3e/data/botPolicies.yaml) does not fit your needs, that would be another step required[^1]. + +My [final configuration](https://git.berlin.ccc.de/vinzenz/nixos-configuration/src/commit/7a17930dd4f225cda4047f1df7d650249c91f29b/nixosConfigurations/hetzner-vpn2/nginx.nix#) for nginx and anubis was as follows: + +```nix +{ pkgs, ... 
}: +let + blog-domain-socket = "/run/nginx/blog.sock"; + anubis-domain-socket = "/run/anubis/anubis-blog.sock"; +in +{ + systemd.services = { + nginx.serviceConfig.SupplementaryGroups = [ "anubis" ]; + anubis-main.serviceConfig.SupplementaryGroups = [ "nginx" ]; + }; + + services = { + nginx = { + enable = true; + + recommendedProxySettings = true; + recommendedTlsSettings = true; + recommendedGzipSettings = true; + recommendedOptimisation = true; + + virtualHosts = { + "zerforschen.plus" = { + addSSL = true; + enableACME = true; + locations."/" = { + proxyPass = "http://unix:" + anubis-domain-socket; + }; + }; + + "blog-in-anubis" = { + root = pkgs.zerforschen-plus-content; + listen = [ + { + addr = "unix:" + blog-domain-socket; + } + ]; + }; + }; + }; + + anubis.instances.main = { + enable = true; + settings = { + BIND = anubis-domain-socket; + TARGET = "unix://" + blog-domain-socket; + }; + }; + }; +} +``` + +[^1]: Part of the reason I am writing this article is to check if my RSS reader can handle it.