# This module does the main scrutinizer work.
# All the test, rating and alert functions are implemented
# here.

package SM::Scrutinizer;
use SM::GV qw(%log_record); # Global Variable
use SM::Conf;
use SM::Defs;
use SM::Statistic;
use Data::Dumper;
use strict;
use warnings;

# this function calcluates the
# weighted moving average of an
# active session (request by request)
sub calc_req_wma {
	# get the arguments
	my ($rec) = @_;


	# do we have enough periodes for a ewma?
	if($rec->{time_periodes} < $SM::Conf::WMA_2_EWMA) { # NO

			# calculate a weighted moving average, cause its
			# not possible to do a exponential moving average
			# if you have to less values (or your ewma would
			# be wrong).

			# the short time moving average
			$rec->{req_st} = (
					# multiply the current ma ...
					$rec->{req_st} *
					# sum of weights of all the previous ma.
					$SM::GV::sum[$rec->{time_periodes}-1] +
					# add the current ma ...
					$rec->{hits_per_second} *
					# with the weight of the current periode
					$rec->{time_periodes} 
					# devide the hole sum by all weights 
				) / $SM::GV::sum[$rec->{time_periodes}];

			# the long time moving average.
			# this one is calculated WIDOUT weights!
			$rec->{req_lt} = (
					$rec->{req_lt} * 
					($rec->{time_periodes}-1) + 
					$rec->{hits_per_second}	
				) / $rec->{time_periodes};

	} else { # YES -> do a ewma

			# calculate the two ewma for the short and
			# long time. the only difference of the two
			# is the alpha factor.	
			$rec->{req_st} = 
				$SM::Conf::ALPHA_ST * 
				$rec->{hits_per_second} + 
				(1-$SM::Conf::ALPHA_ST) *
				$rec->{req_st};

			$rec->{req_lt} = 
				$SM::Conf::ALPHA_LT * 
				$rec->{hits_per_second} + 
				(1-$SM::Conf::ALPHA_LT) *
				$rec->{req_lt};

	}

	# uptate the max_ma if necessary
	if($rec->{req_st}>$rec->{max_ma}) {
			$rec->{max_ma} = $rec->{req_st};
	}

}

# this function calcluates
# the hit ratio of the
# different requests of a
# session 
sub calc_hit_ratio {
	# get the arguments
	my ($rec) = @_;

	# create a variable for
	# storing the result
	my $hit_ratio=0;
	# loop thru all requests transmit
	# by one client in his active session
	foreach my $uri (keys(%{$rec->{e}})) {

		# calcluate the ratio
		$hit_ratio+=$rec->{e}{$uri}{hits}**2/$rec->{count_sys};

		# search the most hit uri
		if($rec->{max_uri_hits} < $rec->{e}{$uri}{hits}) {
			$rec->{max_uri_hits} = $rec->{e}{$uri}{hits};
		}
	}
	# save the calculated ratio
	$rec->{hit_ratio} = $hit_ratio;

	# save the peak value of this session
	if($hit_ratio>$rec->{max_hit_ratio}) {
		$rec->{max_hit_ratio} = $hit_ratio;
	}

	
}

# this function cleans up the
# data structure. It removes
# uris from the statistics
# which weren't hit a for 
# defined time.
sub remove_old_uris {
	# get the arguments
	my ($rec) = @_;

	# reset the global
	# hit counter
	$rec->{count_sys}=0;

	# loop thru each uri in the data structure
	# of the clients active session
	foreach my $uri (keys(%{$rec->{e}})) {
		# check if an uri is outdated
		if($log_record{timestamp}-$rec->{e}{$uri}{last_ts}>$SM::Conf::TIMEOUT_URI)
		{
			# remove this uri
			delete $rec->{e}{$uri};
		} else {
			# increment the global
			# hit counter
			$rec->{count_sys}+=$rec->{e}{$uri}{hits};
		}
	}

	# loop thru each timestamp in the data structure
	# of the clients active session
	foreach my $ts (keys %{$rec->{ts}}) {
		# check if a timestamp is outdated
		if($ts+$SM::Conf::TIMEOUT_URI<$SM::GV::log_record{timestamp})
		{
			# remove this timestamp
			delete $rec->{ts}{$ts};
		}
	}

}

# this function calculates
# the ratio between the
# different return stats
# of the requests.
sub alert_function_ret_status {
	# get the arguments
	my ($rec) = @_;
	# create a variable for
	# storing the result
	my ($result);

	# return code alert "function"
	my $c2=$rec->{methods}{2} || 0; # code 2xx events
	my $c3=$rec->{methods}{3} || 0; # code 3xx events
	my $c4=$rec->{methods}{4} || 0; # code 4xx events
	my $c5=$rec->{methods}{5} || 0; # code 5xx events


	# get the base level
	# from the config file
	$result = $SM::Conf::f_ret_status{base};

	# if return status 2XX or 3XX
	if($c2 || $c3) { 
		# recalculate
		$result+=($c2/($c2+$c3))*$SM::Conf::f_ret_status{stretch};
	} 

	# if return status 4XX
	if($c4||$c5) {
		# recalculate
		$result+=log($c4+$c5)/log($SM::Conf::f_ret_status{logbase}) *
				 $SM::Conf::f_ret_status{stretch};
	}

	# if statistics are enabled and
	# at least one 2XX or 3XX occured
	if($SM::Conf::DO_STATISTICS && ($c2 || $c3)) {
			# add them to the statistics of
			# the return codes
			$SM::Statistic::stat{alert_code}[
				int(($c2/($c2+$c3))*100)
			]++;
	}

	# return the calculated result
	return $result;
}


# this function calcluates
# the ratio between the
# different filetypes.
sub alert_function_file_type {
	# get the arguments
	my ($rec) = @_;
	# create a variable for
	# storing the result
	my ($result);

	# define the different types of files
	my $f0=$rec->{filetypes}{ DYNAMIC."" } || 0;
	my $f1=$rec->{filetypes}{ STATIC."" }  || 0;
	my $f2=$rec->{filetypes}{ PICTURE."" } || 0;

	# get the base from the config file
	$result = $SM::Conf::f_file_type{base}; 
	# recalculate
	$result+= $f0/($f0+$f1+$f2)*$SM::Conf::f_file_type{stretch};

	# return the calculated result
	return $result;
}

# this function assigns
# the distribution of
# the requests in time
sub calc_chi {
	# get the arguments
	my ($rec) = @_;
	# create some local
	# variables for calculations
	my $choices=0;
	my $elements=0;
	# get the current timestamp
	my $smalest_ts=$log_record{timestamp};


	# count how many choices we have -> $choices
	# count the number of elements in the bins -> $elements 
	foreach my $ts (keys %{$rec->{ts}}) {
		$elements+=$rec->{ts}{$ts};
		# readjust the timestamp
		# to get the earliest
		if($ts<$smalest_ts) {
			# save the earliest
			# timestamp
			$smalest_ts=$ts;
		}
	}	

	# get the range of time in which
	# requests occured in the clients
	# active session
	$choices=$log_record{timestamp}-$smalest_ts;	

	# we need atleast two choices that we can calculate a
	# chi^2 distribution
	if($choices>2) {
		
		# calculate the expected value
		my $expected=$elements/$choices;

		# this variable will sum up the aberreation 
		my $aberreation=0;

		# for each point in time a
		# request occured
		foreach my $key (keys %{$rec->{ts}}) {
			# calculate the aberreation
			$aberreation+=($rec->{ts}{$key}-$expected)**2/$expected;
		}

		# calculate the normed chi value
		$rec->{chi}=$aberreation/($choices-1);

	# if less than two choices are
	# available
	} else {
		# set the chi value to zero
		$rec->{chi}=0; 
	}
}

# apply alert function based on 3 parameters with
# rotation of the function when it reaches the maximal
# value.
sub apply_alert_function_3p_wr {
	my ($f, $X) = @_;
	my ($result);

	# normal case, just the function value
	# based on the parameters found during
	# fitting by gnuplot
	if($X<=$f->{max}) {
		$result = 
			$f->{base} +
			$f->{stretch} *
				(1-$f->{p}{a} *
					($X - $f->{p}{b}) **
					$f->{p}{c}
				);

	# we rotate the function arount the upper right corner 
	# of the graph very mathematically spoken ;-)
	# for values that we didn't see during training.
	} elsif($X>$f->{max} && $X < 2*$f->{max}) {

		# base part
		my $sub_result= 
			$f->{base} +
			$f->{stretch} *
				(1-$f->{p}{a} *
					($X - $f->{p}{b}) **
					$f->{p}{c}
				);

		# base + rotated function value 
		$result =
			$sub_result +
			$f->{stretch} *
			($sub_result -
				(1-$f->{p}{a} *
				((
					2*$f->{max}-$X
				 ) - $f->{p}{b}) **
				$f->{p}{c})
			);

	# continue the alert level in a linar fashion
	} else {

		# base value
		my $sub_result= 
			$f->{base} + 2 *
			$f->{stretch} *
				(1-$f->{p}{a} *
					($f->{max} - $f->{p}{b}) **
					$f->{p}{c}
				);

		# base + linear part
		$result = 
			$sub_result +	
			($X-2*$f->{max}) / $f->{max}
			
	}

	# if we get a wrong result because of a negative alert
	# value.  shouldn't happen when the training was successful 
	if($result<0) {
		# print errors
		print STDERR "\nSERIOUS WARNING: got an alert value < 0 -> $result\n";
		print STDERR "check your graphs and adjust the base!\n\n";

	}

	# return the calculated result
	return $result;
}

# apply alert function based on 2 parameters 
sub apply_alert_function_2p {

	# get the agruments
	my ($f, $X) = @_;
	# creeate a variable for
	# storing the result
	my ($result);

	# function value based on the parameters
	# found during training by gnuplot.
	$result = 
		$f->{base} +
		$f->{stretch} *
			($f->{p}{d} *
				($f->{p}{e} ** $X)
			);

	# if we get a wrong result
	# because of a negative alert
	# value (should never happen)
	if($result<0) {
		# print errors
		print STDERR "\nSERIOUS WARNING: got an alert value < 0 -> $result\n";
		print STDERR "check your graphs and adjust the base!\n\n";

	}
	
	# return the calculated result
	return $result;
}

# this function gets the
# average load level of the
# cpu from the proc filesystem.
sub get_load_average {
	# open the inode from the proc filesystem
	open(FH, "/proc/loadavg") || warn "not able to open /proc/loadavg: $!";
	# get the filehandle
	my $entry=<FH>;
	# parse out the cpu-load
	if($entry=~m/^([^ ]+)/) {
		# save the cpu load
		# into a global variable
		$SM::GV::load_average=$1;
	}
	# close the filehandle
	close(FH);
}

# this function puts all together.
# it calls each calculating function
# and calculates the final alert
# value
sub scrutinize {

	# get the arguments
	my ($cur) = @_;
	# create local variables for the sub results
	my ($a_req_st, $a_req_lt, $a_periode, $a_time_dist, $a_uri_spreading,
			$a_ret_status, $a_file_type);
	my ($hits_in_last_sec);

	my $FHA;

	# do calculations
	my $time = $SM::GV::log_record{timestamp} - $cur->[VAL]->{last_ts}; 

	# loop until time reaches
	# a negative value
	while($time>0) {

		# get the periodes
		++$cur->[VAL]->{time_periodes};

		# calculate the weighted moving average
		calc_req_wma($cur->[VAL]);		

		# the calculations are just done one and not for every
		# timeperiode in which no request didn't arrive
		next if($time--!=$SM::GV::log_record{timestamp}-$cur->[VAL]->{last_ts});	

		# clean up the stored uris
		remove_old_uris($cur->[VAL]);

		# calcluate the chi value
		# of the distribution in time
		calc_chi($cur->[VAL]);		
	
		# calculate the hit ratio
		calc_hit_ratio($cur->[VAL]);		

		# get the number of hits in the last second
		$hits_in_last_sec = $cur->[VAL]->{hits_per_second};
		# reset the number of hits in the last second
		$cur->[VAL]->{hits_per_second}=0;

		# generate the starting alert value
		my ($alertvalue) = (1);

		# all these statistical values
		# can be defined as active or
		# inactive. Depending on this
		# they'll have influence on
		# the alert value.	

		# if the request ratio test is enabled
		if($SM::Conf::DO_REQUEST_RATIO) {
			# generate the function for 
			# the short time request ratio
			$a_req_st		= apply_alert_function_3p_wr(
							\%SM::Conf::f_request_ratio,
							 $cur->[VAL]{req_st}
						  	);
			# recalc the alert value
			$alertvalue		*= $a_req_st;

			# and the long time request
			# ratio
			$a_req_lt		= apply_alert_function_3p_wr(
							\%SM::Conf::f_request_ratio,
							 $cur->[VAL]{req_st}
						  	);
			# recalc the alert value
			$alertvalue		*= $a_req_lt;

		}

		# if the period test is enabled
		if($SM::Conf::DO_PERIODES) {
			# generate the period duration
			# function
			$a_periode		= apply_alert_function_3p_wr(
							\%SM::Conf::f_periodes,
							 $cur->[VAL]{time_periodes}
							  );
			# recalc the alert value
			$alertvalue		*= $a_periode;
		}

		# if the time distribution test is enabled
		if($SM::Conf::DO_TIME_DIST) {
			# generate the time distribution
			# function
			$a_time_dist    = apply_alert_function_2p(
							\%SM::Conf::f_time_dist,
							 $cur->[VAL]{chi}
						  );
			# recalc the alert value
			$alertvalue		*= $a_time_dist;
		}

		# if the uri spreading test is enabled
		if($SM::Conf::DO_URI_SPREADING) {
			# generate the uri spreading
			# function
			$a_uri_spreading= apply_alert_function_3p_wr(
							\%SM::Conf::f_uri_spreading,
							 $cur->[VAL]{hit_ratio}
							  );
			# recalc the aler value
			$alertvalue		*= $a_uri_spreading;
		}

		# if the return status test is enabled
		if($SM::Conf::DO_RET_STATUS) {
			# generate the return status
			# function
			$a_ret_status	= alert_function_ret_status($cur->[VAL]);
			# recalc the alert value
			$alertvalue		*= $a_ret_status;
		}

		# if the file type test is enabled
		if($SM::Conf::DO_FILE_TYPE) {
			# generate the file type
			# function
			$a_file_type	= alert_function_file_type($cur->[VAL]); 
			# recalc the alert value
			$alertvalue		*= $a_file_type;
		}
	
		# save the alert value for the
		# current client entry
		$cur->[VAL]{alert} = $alertvalue;

		# check if the alert level reaches
		# predefined tresholds
		
		# level one
		if($cur->[VAL]{alert} > $SM::Conf::ALERT_LEVEL0) {
	
			# log this entry on level one
			*FHA = *SM::GV::FHA0;			

			# level two
			if($cur->[VAL]{alert} > $SM::Conf::ALERT_LEVEL1) {

				# log this entry on level two
				*FHA = *SM::GV::FHA1;			

				# if alert debugging is enabled
				if($SM::Conf::ALERT_DEBUG_INFO) {
					# opens or creates a new debug_IP file with
					# additional detail informations
					open(FH, ">", $SM::Conf::LOG_DIR."debug_".$cur->[VAL]->{ip}) || 
						warn "can't write debug alert file $!\n";
					# set the indent mode 
					$Data::Dumper::Indent=1;
					# write the data structure into the file
					print FH Data::Dumper->Dump([\%{$cur->[VAL]}], [qw(*node)]);
					# close the file
					close(FH);
	
				}

				# check if the client is already on blacklist
				if(SM::Blacklist::is_on_blacklist($cur->[VAL]->{ip}) < 1) {

					# if not ban it!
					SM::Blacklist::ban($cur->[VAL]->{ip},
							0,					# 0 = take default timeout
							"LEVEL1",			# blocking level
							0,					# data from scrutinizer = 0
							\%{$cur->[VAL]});	# reference to data structur
				}

				# level three
				if($cur->[VAL]{alert} >= $SM::Conf::ALERT_LEVEL2) {
					# log this entry on level three
					*FHA = *SM::GV::FHA2;			

					# check if the client isn't on blacklist or just on
					# level 1
					if(SM::Blacklist::is_on_blacklist($cur->[VAL]->{ip}) < 2) {

						# ban it on level 2
						SM::Blacklist::ban($cur->[VAL]->{ip},
								0,					# 0 = take default timeout
								"LEVEL2",			# blocking level
								0,					# data from scrutinizer = 0
								\%{$cur->[VAL]});	# reference to data structur
					}


				}

			}


			# write the different statistical values
			# into the corresponding alert level logfile
			my $str="";
			$str.=sprintf("%.3f ",$cur->[VAL]{alert}); 
			$str.=sprintf(" rst %.3f",$a_req_st) if($SM::Conf::DO_REQUEST_RATIO); 
			$str.=sprintf(" rlt %.3f",$a_req_lt) if($SM::Conf::DO_REQUEST_RATIO); 
			$str.=sprintf(" p %.3f",$a_periode) if($SM::Conf::DO_PERIODES);
			$str.=sprintf(" td %.3f",$a_time_dist) if($SM::Conf::DO_TIME_DIST); 
			$str.=sprintf(" us %.3f",$a_uri_spreading) if($SM::Conf::DO_URI_SPREADING); 
			$str.=sprintf(" rs %.3f",$a_ret_status) if($SM::Conf::DO_RET_STATUS); 
			$str.=sprintf(" ft %.3f",$a_file_type) if($SM::Conf::DO_FILE_TYPE);  
			$str.=sprintf(" %s ip: %s",$SM::GV::log_record{ts}, $cur->[VAL]{ip});
			$str.=sprintf(" req_st: %.2f req_lt: %.2f",
								$cur->[VAL]{req_st},
								$cur->[VAL]{req_lt});
			$str.=sprintf(" count sec: %i", $hits_in_last_sec);
			$str.=sprintf(" count sum: %i",$cur->[VAL]{count_sum});
			$str.=sprintf(" count sys: %i",$cur->[VAL]{count_sys});
			$str.=sprintf(" periodes: %i",$cur->[VAL]{time_periodes}); 
			$str.=sprintf(" chi: %.3f ",$cur->[VAL]{chi}); 
			$str.=sprintf(" hit_ratio: %.2f\n",$cur->[VAL]{hit_ratio}); 

			# write the informations into the logfile
			print FHA $str;
		}

		# if statistics are enabled	
		if($SM::Conf::DO_STATISTICS) {
			# calculate the alert level distribution
			$SM::Statistic::stat{alert_dist}[int($cur->[VAL]{alert}*100)]++;
		}
	
	}

	# save the last logging time
	$cur->[VAL]->{last_ts}=$SM::GV::log_record{timestamp};

	# reset the hits per second counter.
	# 1 means this is was the first request for 
	# the next periode ...
	$cur->[VAL]->{hits_per_second}=1;
}

1;
