~moviuro/moviuro.bin

moviuro.bin/lie-to-us -rwxr-xr-x 10.6 KiB
dab94f1a — Moviuro lie-to-*, blackhole: add the date(1) when input was fetched in comments 3 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/bin/sh
# Build tagged local-zone data for unbound(8) out of public domain lists and
# hosts(5) files. Run with -h for the full help text.

# Stop at the first unhandled command failure.
set -e

# NOTE(review): _target is not referenced anywhere else in the visible script.
_target="127.0.0.1"
# Space-separated domains removed from the output; overridden by -i.
_ignore="use-application-dns.net"
# Name of this script without its directory part, used in the help text.
_myname=${0##*/}
# Default inputs, one "tag!URL[!IP]" record per line; replaced by the first
# operand when one is given. Records with an IP are treated as hosts(5) files,
# the others as plain domain lists.
_input="bad!https://s3.amazonaws.com/lists.disconnect.me/simple_ad.txt
bad!https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
bad!https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt
bad!https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
bad!https://pgl.yoyo.org/adservers/serverlist.php?mimetype=plaintext&hostformat=plain
bad!https://raw.githubusercontent.com/ookangzheng/dbl-oisd-nl/master/dbl2.txt
nsfw!https://raw.githubusercontent.com/ookangzheng/dbl-oisd-nl/master/dbl_nsfw.txt
bad!https://someonewhocares.org/hosts/zero/hosts!0.0.0.0
bad!https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts!0.0.0.0
bad!https://www.github.developerdan.com/hosts/lists/ads-and-tracking-extended.txt!0.0.0.0
nsfw!https://raw.githubusercontent.com/blocklistproject/Lists/master/porn.txt!0.0.0.0
bad!https://raw.githubusercontent.com/blocklistproject/Lists/master/ransomware.txt!0.0.0.0
bad!https://raw.githubusercontent.com/blocklistproject/Lists/master/phishing.txt!0.0.0.0
bad!https://raw.githubusercontent.com/blocklistproject/Lists/master/malware.txt!0.0.0.0
bad!https://raw.githubusercontent.com/blocklistproject/Lists/master/tracking.txt!0.0.0.0
gambling!https://raw.githubusercontent.com/blocklistproject/Lists/master/gambling.txt!0.0.0.0"

# __usage
# Print the help text on stdout (callers redirect it to stderr on bad usage).
# The here-document delimiter is unquoted, so $_myname, $_ignore and $_input
# are expanded with their current values and each "\\" comes out as a single
# backslash.
__usage () {
  cat << EOF
$_myname [-d] [-o out] [-i "domain [domain [...]]"] [tag!URL[!IP][(^|\\n)tag!URL[!IP]...]]
$_myname -h

Create lying DNS files for unbound, using the power of tags (unbound >= 1.16).
Tags can then be used to define specific resolving rules, see example below.
Lies are tagged:
  o bad (incl. ads, malware and malvertizing)
  o nsfw
  o gambling
Lies never include: TLDs, localhost or public suffixes, (see
https://publicsuffix.org/)

Flags
  o -d: enable debug (set -x)
  o -h: this help text
  o -o out: specify an output file.
    Default: <stdout>
  o -i "domain [domain [...]]": domains to ignore and remove from the output.
    Default: $_ignore

The last argument is an optional list of inputs. Each input file is downloaded
and sanitized and all domains are added to the output with all the correct tags.

To save some disk space and processing by unbound(8), $_myname removes
subdomains of domains that are tagged. As such, if "malware.tld" and
"foo.malware.tld" are tagged as "bad", then "foo.malware.tld" will be removed
from the output because "malware.tld" is a suffix to "foo.malware.tld" (and
unbound(8) being a recursive resolver, it wouldn't care about subdomains such
as "foo.malware.tld").
This will not apply to e.g. "bar.legit.com" and "baz.legit.com" if
"legit.com" is not tagged the same as "bar" and "baz".

The "IP" parameter is optional and used for "ready-to-use" hosts(5) files. E.g.

  bad!https://raw.githubusercontent.com/ookangzheng/dbl-oisd-nl/master/dbl.txt
      ^- this is a domain list, we don't specify an IP

  nsfw!https://raw.githubusercontent.com/ookangzheng/dbl-oisd-nl/master/hosts_nsfw.txt!0.0.0.0
       ^- this is a host file where we disregard 0.0.0.0 to get domains

The caret (^) and newline (\\n) can be used to seperate items.
Its default value is:
  $_input

MAKE SURE YOU ARE ALLOWED TO USE THE LISTS IN YOUR SITUATION

There are also a few domains that $_myname takes care of:
  o localhost: this one is the most bizarre. Some badly written applications
    WILL query our lying DNS resolver for 'localhost' and ignore
    nsswitch.conf(5) (like, e.g. redis)
  o We explicitly reject TLDs. If one of the input files contains a TLD, we will
    ignore it: we intend to block bad domains, not the web in its entirety.

There is also one domain that you (the operator) must take care of:
  o use-application-dns.net: this one is a canary domain for DoH. It is used
    by badly written software (Firefox) to check whether it should use DoH. This
    defeats the ENTIRE FUCKING PURPOSE of DoH ANYWAY because my ISP could very
    well intercept and modify that DNS request (made by the usual means) to
    return NXDOMAIN, thus degrading Firefox to classic DNS queries. WHICH IS THE
    EXACT SITUATION DoH WAS SUPPOSED TO PROTECT USERS FROM.

EXAMPLES
  In unbound.conf(5)

  define-tag: "bad gambling nsfw home_whitelist"
  # sane defaults
  access-control: 0.0.0.0/0 deny

  # 10.28.56.0/24 querying "bad" domains get a specific reply
  #  no specifics for nsfw domains
  #  using different A replies helps identify what went well/wrong
  access-control-tag: 10.28.56.0/24 "bad"
  access-control-tag-data: 10.28.56.0/24 "bad"  "A 127.0.56.1"

  # 10.29.58.0/24 querying "bad or nsfw" domains get a specific reply, but we
  # will answer truthfully for domains with the home_whitelist tag
  access-control-tag: 10.29.58.0/24 "bad nsfw home_whitelist"
  access-control-tag-action: 10.29.58.0/24 "home_whitelist" always_transparent
  access-control-tag-data: 10.29.58.0/24 "bad"  "A 127.0.58.1"
  access-control-tag-data: 10.29.58.0/24 "nsfw" "A 127.0.58.2"

  # break (NXDOMAIN) use-application-dns.net
  local-zone: use-application-dns.net static
  # unbreak laposte.fr, because they are outsourcing core functionality;
  local-zone-tag: cdn.tagcommander.com "home_whitelist"
  local-zone:     cdn.tagcommander.com redirect
  # NB: tagcommander.com ends up with the "bad" tag, but our setup above
  # overrides that for 10.29.58.0/24

  # The generated file is included after the rest
  include: out.$_myname


SEE ALSO
  https://support.mozilla.org/en-US/kb/canary-domain-use-application-dnsnet
  https://news.ycombinator.com/item?id=22414912
  https://publicsuffix.org/
  https://unbound.docs.nlnetlabs.nl/en/latest/topics/filtering/tags-views.html
  unbound.conf(5)
EOF
}

# Command-line parsing. The optstring starts with ":", so getopts stays silent
# and both unknown flags and missing option arguments end up in the catch-all
# arm, which prints the help text on stderr and fails.
while getopts ":dhi:o:" _opt; do
  case "$_opt" in
    d)
      set -x
      ;;
    h)
      __usage
      exit 0
      ;;
    i)
      _ignore="$OPTARG"
      ;;
    o)
      _output_file="$OPTARG"
      ;;
    *)
      __usage >&2
      exit 1
      ;;
  esac
done

# Drop the flags consumed above so that $1 is the first operand
shift "$((OPTIND-1))"

# A non-empty first operand replaces the built-in list of inputs
if [ -n "$1" ]; then
  _input="$1"
fi

# __download URL
# Write the body of URL on stdout. The implementation is chosen once, here,
# from whichever downloader is installed.
# first choice is curl(1), should be OK for Linux
if command -v curl >/dev/null 2>&1; then
  __download () {
    # -f: an HTTP error (4xx/5xx) makes curl fail instead of printing the
    #     server's error page, which would otherwise be parsed as a list
    # -S: keep reporting failures on stderr even though -s silences progress
    curl --compressed -fsSL "$1"
  }
# second is fetch(1), for FreeBSD
elif command -v fetch >/dev/null 2>&1; then
  __download () {
    fetch -q -o - "$1"
  }
# last comes ftp(1), which does far more than its name implies (OpenBSD only)
elif [ "$(uname -s)" = OpenBSD ] && command -v ftp >/dev/null 2>&1; then
  __download () {
    ftp -o - "$1"
  }
else
  printf '%s\n' "Oh no! You don't seem to have curl(1) or fetch(1)" >&2
  printf '%s\n' "I'm cowardly exiting" >&2
  exit 4
fi

# If we're here, we're actually going to do something
# Scratch directory: one file per tag under tags/, plus the acknowledgement
# comments (acks) and the public suffix list (tlds).
_temp_dir="$(mktemp -d -t "$(basename "$0").XXXXXX")"
mkdir "$_temp_dir/tags"
_temp_ack="$_temp_dir/acks"
# Anchored ERE matching exactly the ignored domains: dots are escaped so they
# only match a literal dot, surrounding spaces are trimmed and every run of
# spaces between two domains becomes one alternation bar.
_ignore_regex="^($(printf '%s\n' "$_ignore" |
  sed -e 's/[.]/\\./g' -e 's/^ *//' -e 's/ *$//' -e 's/  */|/g'))$"
_public_suffix="$_temp_dir/tlds"

# __cleanup
# Remove the scratch directory. Safe to call more than once: it succeeds when
# the directory is already gone, so it can run both explicitly and from the
# EXIT trap without tripping set -e.
__cleanup () {
  if [ -n "$_temp_dir" ] && [ -d "$_temp_dir" ]; then
    rm -fr -- "$_temp_dir"
  fi
}

# Clean up on every way out, including a set -e abort. A trapped signal must
# end the script explicitly, otherwise execution would resume after the
# handler with the scratch directory deleted; exiting also fires the EXIT trap.
trap __cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# __add_hosts URL IP destination
# Append to the destination file the host name of every hosts(5) line of URL
# whose address field is exactly IP, stripped of its final dot(s), and record
# the source in the acknowledgements file ($_temp_ack).
# A file without any matching line is not an error: the function still
# succeeds, like __add_simple does for empty lists.
__add_hosts () {
  printf '# Original file at %s (fetched on %s)\n' "$1" "$(date)" >> "$_temp_ack"
  # IP is handed to awk as data and compared as a string, so its dots are not
  # regex wildcards, and the default field splitting accepts tabs as well as
  # spaces between the address and the name.
  __download "$1" |
    awk -v ip="$2" '
      ($1 "") == (ip "") && NF >= 2 {
        host = $2
        # same trimming as the grep used for plain lists: drop trailing
        # dots (and backslashes), skip names that end up empty
        sub(/[.\\]+$/, "", host)
        if (host != "") print host
      }' >> "$3"
}

# __add_simple URL destination
# Append the domains of a plain list (one domain per line) found at URL to the
# destination file and record the source in the acknowledgements file
# ($_temp_ack).
__add_simple () {
  {
    printf '# Original file at %s (fetched on %s)\n' "$1" "$(date)"
  } >> "$_temp_ack"

  # First filter: keep the lines that do not start like a comment.
  # Second filter: keep each line up to its last character that is not a dot,
  # which removes the final dot of a fully qualified name.
  # The trailing no-op makes an empty list a success.
  __download "$1" |
    grep -e '^[^\#]' |
    grep -Eo '.*[^\.]' >> "$2" ||
    :
}

# __create_public_suffix
# Download the public suffix list and store it, sorted, in $_public_suffix;
# also credit it in the acknowledgements file ($_temp_ack).
__create_public_suffix () {
  printf '# The public suffix list was used https://publicsuffix.org\n' >> "$_temp_ack"
  # This file has a specific syntax: https://publicsuffix.org/list/
  # The first grep drops "//" comments, empty lines and "!" exception rules.
  # The second one keeps each rule from its first character that is not part
  # of a wildcard prefix, so "*.ck" is stored as "ck".
  __download "https://publicsuffix.org/list/public_suffix_list.dat" \
    | grep -vE '^(//|$|!)' \
    | grep -Eo '[^(^\*\.)].+' \
    | sort > "$_public_suffix"
}

# __to_simple_usable_list file
# Rewrite file in place as a clean list of domains: lower-case, unique, free
# of "localhost", of the domains matched by $_ignore_regex and of every entry
# of $_public_suffix, ordered by reversed name. A scratch file named file.2 is
# used in between and removed at the end.
__to_simple_usable_list () {
  # Pass 1, in order: strip carriage returns, lower-case everything, drop
  # empty lines and lines holding a space or a slash, drop "localhost" (many
  # poorly configured programs ask the DNS who "localhost" is instead of
  # following nsswitch.conf(5), and lying about it breaks them), drop the
  # ignored domains, then sort and de-duplicate.
  tr -d '\r' < "$1" \
    | tr '[:upper:]' '[:lower:]' \
    | grep -vE '( |^$|/)' \
    | grep -v '^localhost$' \
    | grep -vE "$_ignore_regex" \
    | sort -u > "$1".2

  # Pass 2: remove TLDs and public suffixes (comm(1) on two sorted files is
  # far faster than any grep(1) here), then order the names by their reversed
  # spelling, i.e. by domain.
  comm -23 "$1".2 "$_public_suffix" \
    | rev \
    | sort \
    | rev > "$1"

  rm "$1".2
}

# __to_output
# Copy stdin to $_output_file, or to stdout when no output file was requested.
# An output file that already exists is first copied to "$_output_file".bak.
__to_output () {
  case "$_output_file" in
    '')
      cat -
      ;;
    *)
      # Keep the previous version around, just in case something goes wrong
      [ ! -e "$_output_file" ] || cp "$_output_file" "$_output_file".bak
      cat - > "$_output_file"
      ;;
  esac
}

__create_public_suffix

# fill $_temp_dir with tagged files holding their list of domains
# Records are separated by newlines or carets, fields by "!": tag, URL and an
# optional IP that marks the URL as a hosts(5) file.
printf '%s\n' "$_input" | tr '^' '\n' |
 while IFS='!' read -r _tag _url _ip; do
  # A trailing newline or caret produces an empty record, and a record needs
  # both a tag (it names the file under tags/) and a URL: skip anything else
  # instead of downloading "" or writing to the tags/ directory itself.
  if [ -z "$_tag" ] || [ -z "$_url" ]; then
    [ -z "$_tag$_url" ] ||
      printf '%s: ignoring malformed input "%s!%s"\n' "$_myname" "$_tag" "$_url" >&2
    continue
  fi
  if [ -n "$_ip" ]; then
    __add_hosts  "$_url" "$_ip" "$_temp_dir/tags/$_tag"
  else
    __add_simple "$_url"        "$_temp_dir/tags/$_tag"
  fi
 done

# Remove public suffixes from each tags/$_tag file
for _tagged_domainlist in "$_temp_dir"/tags/*; do
  # without any tag file the glob stays unexpanded: nothing to sanitize
  [ -f "$_tagged_domainlist" ] || continue
  __to_simple_usable_list "$_tagged_domainlist"
done

# __drop_subdomains tag
# Read domains on stdin, ordered so that a domain comes before its own
# subdomains, and write one `domain tag` line for every domain that is not a
# subdomain of the last domain kept. The comparison is a literal suffix test
# ("." + parent at the very end of the name), so the dots of the parent are
# not wildcards and a match in the middle of a name does not count.
# Empty input produces no output at all.
__drop_subdomains () {
  awk -v tag="$1" '
    # true when name is parent with at least one more label in front
    function is_subdomain(name, parent,    n, p) {
      n = length(name)
      p = length(parent)
      return n > p && substr(name, n - p) == "." parent
    }
    $0 == "" { next }
    kept != "" && is_subdomain($0, kept) { next }
    { kept = $0; printf("%s %s\n", kept, tag) }'
}

# Slim down domain lists, as they contain a LOT of duplicates for unbound
# e.g.: www.malware.tld and malware.tld
# The output file contains `domain tag` lines only
for _tagged_domainlist in "$_temp_dir"/tags/*; do
  # without any tag file the glob stays unexpanded: nothing to slim down
  [ -f "$_tagged_domainlist" ] || continue
  __drop_subdomains "${_tagged_domainlist##*/}" \
    < "$_tagged_domainlist" > "$_tagged_domainlist".nodupes
  mv "$_tagged_domainlist".nodupes "$_tagged_domainlist"
done

# create a single list with all `domain tag` pairs
# The squashing below only merges ADJACENT lines of the same domain. Sorting
# in the C locale compares bytes, and a space sorts before every character
# allowed in a domain, so all the lines of one domain end up next to each
# other; a locale-aware collation gives no such guarantee and would lead to
# the same local-zone being emitted twice.
_dtst="$_temp_dir/domain_to_single_tag"
LC_ALL=C sort "$_temp_dir"/tags/* > "$_dtst"

# now we squash that list
# Output: the acknowledgement comments first, then for every domain one
# local-zone line and one local-zone-tag line listing all of its tags.
{ cat "$_temp_ack"
  awk 'function emit() {
         if (domain != "") {
           printf("local-zone: %s redirect\nlocal-zone-tag: %s \"%s\"\n", domain, domain, tags)
         }
       }
       # a line without both a domain and a tag cannot describe a zone
       NF < 2 { next }
       $1 == domain { tags = tags " " $2; next }
       { emit(); domain = $1; tags = $2 }
       END { emit() }' < "$_dtst"
} | __to_output

__cleanup