#
# two_pass_spider_analysis snapon
#
# This snapon adds a log filter which rejects entries from spiders, where a
# spider is defined to be an IP address which never hits a CSS or JS file
# (or which hits /robots.txt). The database must be built *twice* for this
# to work properly: pass 1 accumulates, per IP, whether it ever fetched a
# JS/CSS file or /robots.txt (persisted via save_node); pass 2 uses that
# saved state to reject the spider hits.
#
# 2012-02-07 - GMF - 1.0 - Initial implementation
#
two_pass_spider_analysis = {

  label = "$lang_admin.snapons.two_pass_spider_analysis.label"
  comment = "$lang_admin.snapons.two_pass_spider_analysis.comment"
  config_snapon_category = ""

  # User-configurable parameters: which database fields hold the client IP,
  # the file type, and the URL. Each is substituted into the filter code
  # below at attach time via {= @parameters{...}{'parameter_value'} =}.
  parameters = {

    # Database field containing the client IP address (default: hostname).
    source_ip_field = {
      parameter_value = "hostname"
      validation_type = "string"
      form_element_label = "$lang_admin.snapons.two_pass_spider_analysis.parameters.source_ip_field.form_element_label"
      form_element_type = "select"
      select_options_source = "database_fields"
      description = ""
    } # source_ip_field

    # Database field containing the file type (compared against 'JS'/'CSS').
    file_type_field = {
      parameter_value = "file_type"
      validation_type = "string"
      form_element_label = "$lang_admin.snapons.two_pass_spider_analysis.parameters.file_type_field.form_element_label"
      form_element_type = "select"
      select_options_source = "database_fields"
      description = ""
    } # file_type_field

    # Database field containing the hit URL (checked for /robots.txt).
    url_field = {
      parameter_value = "page"
      validation_type = "string"
      form_element_label = "$lang_admin.snapons.two_pass_spider_analysis.parameters.url_field.form_element_label"
      form_element_type = "select"
      select_options_source = "database_fields"
      description = ""
    } # url_field

  } # parameters

  # Layout of the parameters in the snapon attach form.
  parameters_form = {

    group_1 = {
      description = "$lang_admin.snapons.two_pass_spider_analysis.parameters_form.group_1.description"
      parameters = {
        source_ip_field = true
        file_type_field = true
        url_field = true
      } # parameters
    } # group_1

  } # parameters_form

  attach_operations = {

    # When attaching: add log filter initialization code which declares the
    # persistent per-IP tracking nodes and a scratch variable. The two
    # assignments ensure the subnodes exist before node references are bound.
    add_log_filter_initializations = {
      type = "add_log_filter_initializations"
      log_filter_initializations = {
        initialize_two_pass_spider_analysis = `
two_pass_spider_info.accessed_js_css_file = '';
two_pass_spider_info.accessed_robots_txt = '';
node two_pass_spider_info = 'two_pass_spider_info';
node accessed_robots_txt = two_pass_spider_info{'accessed_robots_txt'};
node accessed_js_css_file = two_pass_spider_info{'accessed_js_css_file'};
string converted_ip;
`
      } # log_filter_initializations
    } # add_log_filter_initializations

    # When attaching: add log filter finalization code which persists the
    # per-IP tracking nodes so the information collected during the first
    # build is available during the second build.
    add_log_filter_finalizations = {
      type = "add_log_filter_finalizations"
      log_filter_finalizations = {
        finalize_two_pass_spider_analysis = `
save_node('two_pass_spider_info.accessed_js_css_file');
save_node('two_pass_spider_info.accessed_robots_txt');
`
      } # log_filter_finalizations
    } # add_log_filter_finalizations

    # When attaching: add the spider-rejecting log filter.
    add_log_filters = {
      type = "add_log_filters"
      filters = {

        reject_spiders = {
          label = "Reject spiders"
          comment = "Reject hits from spiders. Requires database to be built twice in succession. Added by the Two-pass Spider Analysis snapon."
          value = `
# Convert . to _ in the IP
# (dots would otherwise be interpreted as node path separators when the IP
# is used as a subnode name below)
converted_ip = replace_all({= @parameters{'source_ip_field'}{'parameter_value'} =}, '.', '_');

# If this is a JS file, remember that this IP accessed it.
if (({= @parameters{'file_type_field'}{'parameter_value'} =} eq 'JS') or
    ({= @parameters{'file_type_field'}{'parameter_value'} =} eq 'CSS')) then (
  @accessed_js_css_file{converted_ip} = true;
  #echo('found access on JS or CSS file by ' . {= @parameters{'source_ip_field'}{'parameter_value'} =});
);

# If this is /robots.txt, remember that this IP accessed it.
if (ends_with({= @parameters{'url_field'}{'parameter_value'} =}, '/robots.txt')) then (
  @accessed_robots_txt{converted_ip} = true;
  #echo('found access on robots.txt by ' . {= @parameters{'source_ip_field'}{'parameter_value'} =});
);

# Reject as spiders any hit which did not access JS/CSS files, or did access /robots.txt.
# NOTE(review): during the first build the tracking nodes are still being
# populated, which is why the database must be built twice for correct results.
if (accessed_robots_txt?{converted_ip}) then (
  #echo('Rejecting hit from "' . {= @parameters{'source_ip_field'}{'parameter_value'} =} . '" because that IP has a hit on /robots.txt');
  'reject';
);
if (!(accessed_js_css_file?{converted_ip})) then (
  #echo('Rejecting hit from "' . {= @parameters{'source_ip_field'}{'parameter_value'} =} . '" because that IP has never hit a CSS or JS URL');
  'reject';
);
`
        } # reject_spiders

      } # filters
    } # add_log_filters

  } # attach_operations

} # two_pass_spider_analysis