#!/bin/bash

#
#	hosts file to bind9 conf file zones listing.
#
#	(Use an "include" statement in the bind9 conf file to include
#	the blacklist file from the output of this utility. You also
#	need a null.db zone file)
#
#	Copyright (C) 2012 Martin Lomas (mprogs01(at)ml1 .co .uk)
#
#	This program is free software: you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation, either version 3 of the License, or
#	(at your option) any later version.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with this program.  If not, see <http://www.gnu.org/licenses/>
#
#
#
# Input is one or more hosts files, output is a bind9 zones list pointing to a null.db zone
# Anything with a local address 127.0.0. is assumed to be a blacklisted site or domain:
#   all "127.0.0." entries are zoned to a null "null.db" zone file to return a dummy dns value
# Everything else is assumed to be a comment and is listed out as a comments header
# Leading "ww*" addresses are stripped to blacklist that entire domain
#
# The hosts txt is input on std input
# A bind9 zones file output on std output
# Update the ex(  ) array below to NOT have certain domains and/or subdomains blacklisted
#
# Example usage:
# cat /etc/hosts | hosts2nullzone >blacklist_zones.conf
#
#
# Martin Lomas 2012-06-19
# v3: Add exclusions
#
# Martin Lomas 2012-06-18
# v2: Much faster using rev and sort...
# v1: Very slow repeated use of grep -iv to filter out redundant subdomains
#
#
# One source list of noisome sites (now defunct?) was:
#
#####
# Ad-blocking:
# Kill cookie plundering advertisers and trackers
# Mike Skallas(user245(at)hotmail .com)
# Available at http://everythingisnt.com/hosts.html
# Free only for Residential use.
#####
#
# Two others are:
#
# http://pgl.yoyo.org/as/
# http://someonewhocares.org/hosts/
#

# Exclusion array: sites/domains/subdomains that must NOT be blacklisted.
#   ".example.com" (leading dot)    - the domain and all of its subdomains are excluded.
#   "example.com"  (no leading dot) - only that exact name is excluded; any of its
#                                     subdomains listed on the input are still zappable,
#                                     and a "ww*" name in front of it is not collapsed onto it.
# The filters further down stop scanning this array at the first empty entry.
ex=( l.google.com )


# Check for required utilities:

# Verify that every named utility can be found on the search path.
# Arguments: the command names to look for
# Outputs:   an error message on stderr for the first command that is missing
#            (stderr, so the message never ends up in the generated zones file)
# Exits:     with status 1 as soon as a command is missing
require_tools() {
	local tool
	for tool in "$@"
	do
		command -v "$tool" >/dev/null 2>&1 || { echo "ERROR: No '$tool' in your user search path!" >&2; exit 1; }
	done
}

require_tools egrep sed sort uniq tr rev


# Convert hosts-file text into a bind9 zones listing.
#
# Input (stdin):   hosts-file lines. Lines starting "127.0.0." name a
#                  blacklisted host; every other line is treated as a comment.
# Output (stdout): the numbered comments, the exclusion list, then one
#                  null.db zone statement per blacklisted domain. Domains that
#                  are excluded, or already covered by a listed parent domain,
#                  are shown as "//" comments instead of zones.
# Globals:         ex (read) - the exclusion array
hosts2nullzone() {
	# Ignore the localhost line if on the input, then squeeze every run of
	# whitespace to a single space and turn the first "#" into a "//" comment.
	# Whole runs must be squeezed: a leftover double space would produce an
	# empty host name in the next stage.
	egrep -vi '\Wlocalhost(\.localdomain)?([[:space:]]|$)' |
	sed -e 's/[[:space:]][[:space:]]*/ /g' -e 's|#|//|' -e 's|^ //|//|' |
	(
		# Output full comment lines and just the domain names
		c=0	# Comments count
		while read -r a
		do
			if [ "${a:0:8}" == '127.0.0.' ]
			then
				a="${a#* }"		# Drop the address
				a="${a%% *}"		# Keep just the first host name
				[ -n "$a" ] && printf '%s\n' "$a"
			else
				(( ++c ))		# Or assume it must be a comment
				cc="0000$c"
				# The zero padded number keeps the comments in input order through the sort
				printf '%s\n' "// ${cc:(-4)} ${a#//}"
			fi
		done

	# Sort and remove duplicates.
	# Byte order (LC_ALL=C) so that the result does not depend on the caller's locale.
	) |
	LC_ALL=C sort -g |
	LC_ALL=C uniq |
	(
		# echo out all the comments first.
		# The loop stops on the first non-comment line, which is left in $a
		# ($a is empty if the input ran out).
		while read -r a && [ "${a:0:2}" == '//' ]
		do
			printf '%s\n' "$a"
		done

		# Output pretty spacing and the exclusion list, one line per entry
		# (printf repeats its format for every array element)
		printf '%s\n' '//' '//' '// Excluded from blacklist:'
		printf "// '%s'\\n" "${ex[@]}"
		printf '%s\n' '//' '//'

		# Ensure all lowercase only for the domains.
		# The domain already read into $a is put back at the head of the
		# stream so that it is lowercased like all the others.
		{
			[ -n "$a" ] && printf '%s\n' "$a" | tr '[:upper:]' '[:lower:]'
			tr '[:upper:]' '[:lower:]'
		} |
		(
			# Anything ww... is assumed to blacklist that entire domain:
			# the ww* name is commented and the domain minus the ww* label is output
			while read -r a
			do
				[ -n "$a" ] || continue

				f=''    # Comment flag

				# if a ww* domain and yet not a top level domain:
				if [ "${a:0:2}" == 'ww' ] && [ "x${a}" != "x${a#*.*.}" ]
				then
					# Check that the "ww" stripped domain isn't in the exclude list
					f='// '
					e=0
					while [ -n "${ex[e]}" ]
					do
						# Only entries without a leading "." (subdomains not excluded)
						# that match the truncated name count here
						if [ "${ex[e]:0:1}" != '.' ] && [ ".${ex[e]}" == ".${a#*.}" ]
						then
							# Do NOT truncate the domain name
							# (That is, do not block all the subdomains)
							f=''
							break
						fi
						(( ++e ))
					done
				fi

				if [ -n "$f" ]
				then
					# Comment what is being truncated and then output the domain minus the ww*
					printf '%s\n' "${f}${a}" "${a#*.}"
				else
					# Or output the domain unaltered
					printf '%s\n' "$a"
				fi
			done
		) |
		(
			# Comment out every domain that is on the exclusion list
			while read -r a
			do
				f=''	# Comment flag

				if [ "${a:0:2}" != '//' ]
				then
					# if not already a comment:
					# Check if domain/subdomains are excluded
					e=0
					while [ -n "${ex[e]}" ] && [ -z "$f" ]
					do
						if [ "${ex[e]:0:1}" == '.' ]
						then
							# Comment out that domain and all subdomains (so they are all excluded).
							# Matching ".name" against the ".domain" entry only succeeds on a
							# label boundary, so ".example.org" does not catch "notexample.org".
							case ".${a}" in
								*"${ex[e]}") f='// ' ;;
							esac
						else
							# Comment out just that domain (so subdomains are not excluded)
							[ "${ex[e]}" == "${a}" ] && f='// '
						fi
						(( ++e ))
					done
				fi

				printf '%s\n' "${f}${a}"
			done

		# Reverse to right to left hierarchical order and resort, and remove duplicate lines
		) |
		rev |
		LC_ALL=C sort -g |
		LC_ALL=C uniq |
		(
			# On reversed names a parent domain sorts ahead of its subdomains,
			# so a subdomain is recognised by starting with "<last kept domain>."
			b='.'
			while read -r a
			do
				# Already commented input lines (the reversed "// " is now a trailing " //"):
				if [ "${a:(-2)}" == "//" ]
				then
					printf '%s\n' "$a"
					continue
				fi

				bl="${#b}"
				if [ "${b}" == "${a:0:$bl}" ]
				then
					# Comment out the subdomain (repeat of the already listed domain)
					printf '%s //\n' "$a"
				else
					# Keep this domain and remember it for the lines that follow
					printf '%s\n' "$a"
					b="${a}."
				fi
			done

		# Reverse back to normal left to right
		) |
		rev |
		while read -r a
		do
			# Output the modified domains comments and zones listing

			if [ "${a:0:2}" == '//' ]
			then
				printf '%s\n' "$a"
			else
				printf 'zone "%s"\t{ type master; notify no; file "null.db"; };\n' "${a%% *}"
			fi
		done
	)
}

hosts2nullzone

