add screenshot
This commit is contained in:
parent
5f50674a3d
commit
d794fafc25
2 changed files with 3 additions and 0 deletions
BIN
content/posts/keeping-scrapers-out/chat-screenshot.png
Normal file
BIN
content/posts/keeping-scrapers-out/chat-screenshot.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 22 KiB |
228
content/posts/keeping-scrapers-out/index.md
Normal file
228
content/posts/keeping-scrapers-out/index.md
Normal file
|
@ -0,0 +1,228 @@
|
|||
+++
|
||||
date = '2025-09-15T20:00:00+02:00'
|
||||
title = 'Keeping scrapers out'
|
||||
tags = ['nix', 'nginx', 'anubis']
|
||||
+++
|
||||
|
||||
After reading an [LWN article on high AI scraper traffic](https://lwn.net/Articles/1008897/), I also wanted to do something to "protect" the blog.
|
||||
I do not see any continuous load peaks, but I dislike the idea of giant corporations profiting off of my writing without any negotiation.
|
||||
|
||||
One solution that stands out currently is [anubis](https://anubis.techaro.lol/).
|
||||
It tries to challenge requests in a way that is hard or resource-intensive for scrapers at scale, but easy and resource-light for normal users.
|
||||
|
||||
I could not find a good summary on how to proxy an nginx virtual host through anubis, so here we go.
|
||||
|
||||
<!-- more -->
|
||||
|
||||
### Starting configuration
|
||||
|
||||
The following examples are based on the configuration of this blog.
|
||||
Most things not relevant here are removed, but you can take a look at the [original](https://git.berlin.ccc.de/vinzenz/nixos-configuration/src/commit/5f5c7267dc8c734eca2de87b5c0168523c9fa3b3/hosts/hetzner-vpn2/nginx.nix).
|
||||
|
||||
```nix
|
||||
{ pkgs, ... }:
|
||||
{
|
||||
services.nginx = {
|
||||
enable = true;
|
||||
recommendedProxySettings = true;
|
||||
recommendedTlsSettings = true;
|
||||
recommendedGzipSettings = true;
|
||||
recommendedOptimisation = true;
|
||||
|
||||
virtualHosts = {
|
||||
"zerforschen.plus" = {
|
||||
addSSL = true;
|
||||
enableACME = true;
|
||||
root = pkgs.zerforschen-plus-content;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
### Hide real host
|
||||
|
||||
The first thing to do is to hide the real website.
|
||||
Otherwise, a scraper can just use whatever we rename it to.
|
||||
|
||||
I wanted to forward everything through a unix domain socket.
|
||||
The kind I'll use behaves like a file, which means we do not have to make sure that e.g. the application only listens on localhost.
|
||||
It also makes permission management trivial, but we'll see that later.
|
||||
|
||||
```nix
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
# /run/nginx already exists
|
||||
blog-domain-socket = "/run/nginx/blog.sock";
|
||||
in
|
||||
{
|
||||
services.nginx = {
|
||||
enable = true;
|
||||
virtualHosts = {
|
||||
"zerforschen.plus" = {
|
||||
addSSL = true;
|
||||
enableACME = true;
|
||||
# we do not have anything hosted here anymore
|
||||
};
|
||||
# a new virtual host contains the site now
|
||||
"blog-in-anubis" = {
|
||||
root = pkgs.zerforschen-plus-content;
|
||||
# specifying any listen overrides the defaults
|
||||
listen = [
|
||||
{
|
||||
# this makes nginx create the unix domain socket
|
||||
addr = "unix:" + blog-domain-socket;
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
### Create anubis instance
|
||||
|
||||
Next, we need an anubis service that forwards to the socket created by nginx.
|
||||
|
||||
```nix
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
blog-domain-socket = "/run/nginx/blog.sock";
|
||||
in
|
||||
{
|
||||
services = {
|
||||
# previous nginx config
|
||||
|
||||
anubis.instances.main = {
|
||||
enable = true;
|
||||
settings = {
|
||||
TARGET = "unix://" + blog-domain-socket;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
### Forward public host to anubis
|
||||
|
||||
Now we have an anubis instance running, but no way to access it from outside.
|
||||
|
||||
I did not want to expose anubis to the public internet directly, so I configured another unix domain socket for this purpose.
|
||||
|
||||
```nix
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
blog-domain-socket = "/run/nginx/blog.sock";
|
||||
anubis-domain-socket = "/run/anubis/anubis-blog.sock";
|
||||
in
|
||||
{
|
||||
services = {
|
||||
nginx = {
|
||||
enable = true;
|
||||
virtualHosts = {
|
||||
"zerforschen.plus" = {
|
||||
addSSL = true;
|
||||
enableACME = true;
|
||||
locations."/" = {
|
||||
# HTTP over unix domain socket
|
||||
proxyPass = "http://unix:" + anubis-domain-socket;
|
||||
};
|
||||
};
|
||||
# "blog-in-anubis" = { ... };
|
||||
};
|
||||
};
|
||||
|
||||
anubis.instances.main = {
|
||||
enable = true;
|
||||
settings = {
|
||||
# this makes anubis create and listen on the socket
|
||||
BIND = anubis-domain-socket;
|
||||
TARGET = "unix://" + blog-domain-socket;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### Socket permissions
|
||||
|
||||
The config is nearly complete now.
|
||||
Applying the configuration at this point produces permission errors when trying to access the site, because the sockets are owned by the respective service user and group with 660 permissions.
|
||||
In my case, I simply added the services to each other's groups.
|
||||
If this were a server I was making my living with, I'd probably create a new group and folder and add both services to that one.
|
||||
|
||||
```nix
|
||||
{ pkgs, ... }:
|
||||
{
|
||||
systemd.services = {
|
||||
nginx.serviceConfig.SupplementaryGroups = [ "anubis" ];
|
||||
anubis-main.serviceConfig.SupplementaryGroups = [ "nginx" ];
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
### Final configuration
|
||||
|
||||
Overall, I think NixOS makes the configuration required easy to read and write.
|
||||
The [NixOS options search](https://search.nixos.org/options) is a great source of knowledge for any software configurable through the NixOS configuration.
|
||||
If the [default policy](https://github.com/TecharoHQ/anubis/blob/f745d37d9006c3431bf3d435c61565250ab53a3e/data/botPolicies.yaml) does not fit your needs, customizing it would be another required step[^1].
|
||||
|
||||
Update: Asking ChatGPT about this article indicates everything is working as intended:
|
||||

|
||||
|
||||
My [final configuration](https://git.berlin.ccc.de/vinzenz/nixos-configuration/src/commit/7a17930dd4f225cda4047f1df7d650249c91f29b/nixosConfigurations/hetzner-vpn2/nginx.nix) for nginx and anubis was as follows:
|
||||
|
||||
```nix
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
blog-domain-socket = "/run/nginx/blog.sock";
|
||||
anubis-domain-socket = "/run/anubis/anubis-blog.sock";
|
||||
in
|
||||
{
|
||||
systemd.services = {
|
||||
nginx.serviceConfig.SupplementaryGroups = [ "anubis" ];
|
||||
anubis-main.serviceConfig.SupplementaryGroups = [ "nginx" ];
|
||||
};
|
||||
|
||||
services = {
|
||||
nginx = {
|
||||
enable = true;
|
||||
|
||||
recommendedProxySettings = true;
|
||||
recommendedTlsSettings = true;
|
||||
recommendedGzipSettings = true;
|
||||
recommendedOptimisation = true;
|
||||
|
||||
virtualHosts = {
|
||||
"zerforschen.plus" = {
|
||||
addSSL = true;
|
||||
enableACME = true;
|
||||
locations."/" = {
|
||||
proxyPass = "http://unix:" + anubis-domain-socket;
|
||||
};
|
||||
};
|
||||
|
||||
"blog-in-anubis" = {
|
||||
root = pkgs.zerforschen-plus-content;
|
||||
listen = [
|
||||
{
|
||||
addr = "unix:" + blog-domain-socket;
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
anubis.instances.main = {
|
||||
enable = true;
|
||||
settings = {
|
||||
BIND = anubis-domain-socket;
|
||||
TARGET = "unix://" + blog-domain-socket;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
[^1]: Part of the reason I am writing this article is to check if my RSS reader can handle it.
|
Loading…
Add table
Add a link
Reference in a new issue