# Copyright (c) 2012 Flowerfire, Inc. All Rights Reserved.

# Log format plug-in for Juniper Media Flow Controller access logs written in
# W3C Extended Log Format.  The log and database fields are not declared
# statically: log.filter_preprocessor builds them from the "#Fields:" /
# "# Format:" header line found in the log data itself.

media_flow_controller_w3c = {

  plugin_version = "1.2.4"
  info.1.manufacturer = "Juniper"
  info.1.device = "Media Flow Controller (Access Logs) (W3C)"
  info.1.version.1 = "mfc-11.B.3"

  # 2012-07-06 - 1.0 - GMF - Initial creation
  # 2012-07-17 - 1.0.1 - GMF - Fixed bug in naming of page_views instance (no spaces allowed); capitalized page_views field.
  # 2012-07-23 - 1.1 - GMF - Added support for Countries/Regions/Cities based on cs-host field (and c-ip as before). Fixed support for cs_user_agent derived fields, and cs_referrer-derived fields.
  # 2012-08-07 - 1.2 - GMF - Protected suppress_cs_range and suppress_etag filters in case that the fields don't exist; added support for quoted fields in # Format line; categorized reports; other.
  # 2012-08-20 - 1.2.1 - GMF - Added conversion from cs_host to c_ip; added new parameters to page_views
  # 2012-08-10 - 1.2.2 - GMF - Changed name to have "Access Logs" in parentheses
  # 2012-09-20 - 1.2.3 - GMF - At Juniper's request, removed sc_content_length, and added sc_bytes and sc_bytes
  # 2012-10-17 - 1.2.4 - GMF - changed time_taken to seconds

  # The name of the log format
  log.format.format_label = "Juniper Media Flow Controller Access (W3C) Log Format"
  log.miscellaneous.log_data_type = "http_access"
  log.miscellaneous.log_format_type = "web_server"

  # Description contributed by Juniper
  create_profile_wizard_options.plugin_description = `

This plugin is designed to detect and analyze Media Flow Controller Access Logs that are recorded in the W3C Extended Log Format, first supported in MFC version 11.B.3. The W3C header record documents the log fields and their order in the log records. The information in the header record allows the W3C plug-in to parse and analyze any log file that conforms to the W3C standard regardless of the optional log fields and their order in the log records.

The W3C Log Format provides significant advantage over the NCSA log format supported by earlier versions of Media Flow Controller. Users can now choose to log any number of log fields in any order using a custom log profile. Sawmill can interpret these custom formats and generate reports for each individual log field. The Juniper Media Flow Controller NCSA plug-in can analyze 10 fields defined by the NCSA Common Log Format (CLF). The W3C plug-in can analyze 20 additional fields and the fields can be in any order as defined by the user using the log profile feature. The reports generated by this plug-in include:

`

  # The log is in this format if any of the first ten lines match this regular expression
  log.format.autodetect_regular_expression = "^# Software: Media Flow Controller "

  # This handles "# Format" lines, and creates log and database fields from them.
  # For each header line it:
  #   1. splits the space-separated field list,
  #   2. normalizes each name (lowercase; non-alphanumerics become '_'; trailing '_' stripped),
  #   3. creates the matching log field(s) and appends a capture group to
  #      log.format.parsing_regular_expression,
  #   4. when a profile is being created, also creates the database field(s).
  # All lines starting with '#' are rejected so they are never parsed as data.
  log.filter_preprocessor = `
if (matches_regular_expression(current_log_line(), '^#(Fields| Format): +([^ ].*)$')) then (

  string fields = $2;
  string fieldname;
  v.logfieldindex = 1;
  string numerical_fields = "profiles." . internal.profile_name . ".database.numerical_fields";

  # This subroutine creates a database field.
  # top/bottom, when non-zero, are stored as suppress_top/suppress_bottom on the new field.
  subroutine(create_database_field(string fieldname, int top, int bottom), (
    # echo("create_database_field: " . fieldname); # debug
    debug_message("create_database_field(" . fieldname . ")\n");
    string databasefieldpath = "profiles." . internal.profile_name . ".database.fields." . fieldname;
    (databasefieldpath . "") = "";
    node databasefield = databasefieldpath;
    if (top ne 0) then
      @databasefield{"suppress_top"} = top;
    if (bottom ne 0) then
      @databasefield{"suppress_bottom"} = bottom;

    # sc_age is an integer field, non-aggregating but with many values. Leave it unnormalized and unindexed.
    if (fieldname eq "sc_age") then (
      @databasefield{"type"} = "int";
      @databasefield{"aggregation_method"} = "none";
      @databasefield{"index"} = false;
    );
    databasefield;
  ));

  # This subroutine creates a log field, setting its type when one is given
  subroutine(create_log_field(string fieldname, string type), (
    debug_message("create_log_field(" . fieldname . "; type=" . type . ")\n");
    string logfieldpath = "profiles." . internal.profile_name . ".log.fields." . fieldname;
    (logfieldpath . "") = "";
    node logfield = logfieldpath;
    if (type ne '') then
      set_subnode_value(logfield, "type", type);
    logfield;
  ));

  log.format.parsing_regular_expression = '^';

  # Derived log fields are collected here and only created after all the
  # fields listed in the header line have been created (see the foreach below).
  node log_fields_at_end = new_node();
  subroutine(create_log_field_at_end(node log_fields_at_end, string fieldname, string type), (
    log_fields_at_end{fieldname}{"type"} = type;
  ));

  # Extract the fields one at a time
  while (matches_regular_expression(fields, '^([^ ]+) (.*)$') or matches_regular_expression(fields, '^([^ ]+)$')) (

    string unconverted_fieldname = $1;
    fields = $2;

    # Strip quotes from field name, if any
    if (matches_regular_expression(unconverted_fieldname, '^"(.*)"$')) then
      unconverted_fieldname = $1;

    # Clean up the field name
    fieldname = '';
    for (int i = 0; i < length(unconverted_fieldname); i++) (
      string c = lowercase(substr(unconverted_fieldname, i, 1));
      if (!matches_regular_expression(c, '^[a-z0-9]$')) then
        c = '_';
      fieldname .= c;
    );
    while (matches_regular_expression(fieldname, '^(.*)_$'))
      fieldname = $1;

    if (fieldname eq 'cs_uri') then (
      fieldname = 'cs_uri_stem';
    );

    # 2012-08-20 - GMF - In earlier versions of this log format, the client IP field is incorrectly called cs_host. This compensates for that by assuming that whenever cs_host is seen, it really means c_ip.
    # NOTE: because of this rename, fieldname is never 'cs_host' below this point;
    # the remaining cs_host comparisons are kept only as a safety net.
    if (fieldname eq 'cs_host') then
      fieldname = 'c_ip';

    #echo("fieldname: " . fieldname);

    # Get the log field type
    string log_field_type = '';
    if (fieldname eq 'cs_referer') then (
      log_field_type = 'url';
    );
    else if (fieldname eq 'cs_uri_stem') then (
      log_field_type = 'page';
    );
    else if (fieldname eq 'cs_user_agent') then (
      log_field_type = 'agent';
    );
    else if ((fieldname eq 'c_ip') or (fieldname eq "cs_host")) then (
      log_field_type = 'host';
    );

    # Create the log field (but not cs_request, which is really three log fields, created below).
    if (fieldname eq "cs_request") then (
      create_log_field('cs_method', '');
      create_log_field('cs_uri_stem', 'page');
      create_log_field('cs_protocol', '');
      create_log_field_at_end(log_fields_at_end, 'file_type', 'file');
      create_log_field_at_end(log_fields_at_end, 'screen_dimensions', '');
      create_log_field_at_end(log_fields_at_end, 'screen_depth', '');
      log.format.parsing_regular_expression .= '([^ ]+) ([^ ]+) ([^ ]+)';
    );
    else if (fieldname eq 'cs_referer') then (
      create_log_field(fieldname, log_field_type);
      create_log_field_at_end(log_fields_at_end, 'search_engine', '');
      create_log_field_at_end(log_fields_at_end, 'search_phrase', '');
      log.format.parsing_regular_expression .= '([^ ]+)';
    );
    else if (fieldname eq 'cs_user_agent') then (
      create_log_field(fieldname, log_field_type);
      create_log_field_at_end(log_fields_at_end, 'operating_system', '');
      create_log_field_at_end(log_fields_at_end, 'web_browser', '');
      create_log_field_at_end(log_fields_at_end, 'spider', '');
      # The user agent capture is '(.*)' rather than '([^ ]+)', so it may contain spaces
      log.format.parsing_regular_expression .= '(.*)';
    );
    else if (fieldname eq "time") then (
      # A bracketed timestamp is split into a date field and a time field;
      # the text after the space inside the brackets is matched but not captured.
      create_log_field('date', '');
      create_log_field('time', '');
      log.format.parsing_regular_expression .= '[[]([^:]+):([^ ]+) [^]]+[]]';
    );
    else (
      create_log_field(fieldname, log_field_type);
      log.format.parsing_regular_expression .= '([^ ]+)';
    );

    # If there's another field after this one, add a space to the regular expression
    if (length(fields) > 0) then
      log.format.parsing_regular_expression .= ' ';

    # If we're creating a profile, create the database fields too.
    if (node_exists("volatile.creating_profile")) then (

      # Handle localtime by creating date_time and derived database fields
      if (fieldname eq "time") then (
        create_database_field('date_time', 0, 0);
        create_database_field('day_of_week', 0, 0);
        create_database_field('hour_of_day', 0, 0);
      ); # if time

      # Create derived fields for agent
      else if (fieldname eq "cs_user_agent") then (
        create_database_field('operating_system', 0, 0);
        create_database_field('web_browser', 0, 0);
        create_database_field('spider', 0, 0);
      );

      # Create database field c_ip and derived field for client IP
      else if ((fieldname eq "c_ip") or (fieldname eq "cs_host")) then (
        create_database_field(fieldname, 0, 0);
        create_database_field('location', 0, 0);
      );

      # Create database field cs_referer and derived fields for referrer
      else if (fieldname eq "cs_referer") then (
        create_database_field('cs_referer', 1, 9);
        create_database_field('search_engine', 0, 0);
        create_database_field('search_phrase', 0, 0);
      );

      # Create derived file type field
      else if (fieldname eq "cs_request") then (
        create_database_field('cs_method', 0, 0);
        create_database_field('cs_uri_stem', 0, 9);
        create_database_field('file_type', 0, 0);
        create_database_field('screen_dimensions', 0, 0);
        create_database_field('screen_depth', 0, 0);
        create_database_field('cs_protocol', 0, 0);
      );

      # Don't create a database field for x_origin_fetch_size (is reporting required on this field?)
      else if (fieldname eq "x_origin_fetch_size") then (
      );

      # Don't add a database field for numerical fields
      else if (subnode_exists(numerical_fields, fieldname)) then (
        debug_message("Not adding numerical field: " . fieldname . "\n");
      );

      # Create a normal database field
      else
        create_database_field(fieldname, 0, 0);

    ); # if creating profile

  ); # while another field

  # Create any final log fields
  node lfae;
  foreach lfae log_fields_at_end (
    #echo("Final log field creation: " . node_name(lfae));
    create_log_field(node_name(lfae), @lfae{"type"});
  );

  # Don't parse the #Fields line as a data line
  'reject';

); # if # Format

# Don't parse any other # lines as data lines
else if (starts_with(current_log_line(), '#')) then (
  'reject';
);
`

  # Log fields (intentionally empty: created at parse time by log.filter_preprocessor)
  log.fields = {
  } # log.fields

  # Database fields (intentionally empty: created at profile creation by log.filter_preprocessor)
  database.fields = {
  } # database.fields

  # Get web browser, operating system, and spider information from the user-agent field.
  log.parsing_filters.derive_from_user_agent = {
    value = `
get_user_agent_info(cs_user_agent);
web_browser = volatile.web_browser;
operating_system = volatile.operating_system;
spider = volatile.spider;
`
    requires_fields = {
      cs_user_agent = true
    }
  } # derive_from_user_agent

  # Log Filters
  log.filters = {

    # Replace everything after the '?' in the page field with "(parameters)".
    # The page field of this plug-in is cs_uri_stem (there is no log field
    # called "page"), so the filter operates on cs_uri_stem and is only
    # applied when that field exists.
    remove_query = {
      label = "$lang_admin.log_filters.remove_query_label"
      comment = "$lang_admin.log_filters.remove_query_comment"
      value = "if (contains(cs_uri_stem, '?')) then cs_uri_stem = substr(cs_uri_stem, 0, index(cs_uri_stem, '?') + 1) . '(parameters)';"
      requires_fields = {
        cs_uri_stem = true
      }
    } # remove_query

    suppress_cs_range = {
      label = "$lang_admin.log_filters.suppress_field_label"
      comment = "$lang_admin.log_filters.suppress_field_comment"
      value = "cs_range = '[suppressed]'"
      requires_fields = {
        cs_range = true
      }
    } # suppress_cs_range

    suppress_etag = {
      label = "$lang_admin.log_filters.suppress_field_label"
      comment = "$lang_admin.log_filters.suppress_field_comment"
      value = "sc_etag = '[suppressed]'"
      requires_fields = {
        sc_etag = true
      }
    } # suppress_etag

    mark_entry = {
      label = '$lang_admin.log_filters.mark_entry_label'
      comment = '$lang_admin.log_filters.mark_entry_comment'
      value = 'accesses = 1;'
    } # mark_entry

  } # log.filters

  log.field_options = {
    sessions_page_field = "cs_uri_stem"
    # The preprocessor renames cs_host to c_ip, so the client IP log field is
    # always called c_ip; a cs_host log field is never created.
    sessions_visitor_id_field = "c_ip"
    sessions_event_field = "page_views"
  } # log.field_options

  database.numerical_fields = {

    accesses = {
      default = true
      requires_log_field = false
      entries_field = true
    } # accesses

    # Counts unique values of the client IP log field (c_ip; see the cs_host
    # to c_ip rename in log.filter_preprocessor).
    unique_client_ips = {
      log_field = "c_ip"
      type = "unique"
    } # unique_client_ips

    sc_bytes = {
      type = "int"
      integer_bits = 64
      display_format_type = "bandwidth"
    } # sc_bytes

    cs_bytes = {
      type = "int"
      integer_bits = 64
      display_format_type = "bandwidth"
    } # cs_bytes

    sc_bytes_content = {
      type = "int"
      integer_bits = 64
      display_format_type = "bandwidth"
    } # sc_bytes_content

    # Removed per Juniper suggestion.
    # sc_content_length = {
    #   type = "int"
    #   integer_bits = 64
    #   display_format_type = "bandwidth"
    # } # sc_content_length

    time_taken = {
      type = "int"
      integer_bits = 64
      display_format_type = duration_compact
    } # time_taken

    x_time_used_ms = {
      type = "int"
      integer_bits = 64
      display_format_type = duration_milliseconds
    } # x_time_used_ms

  } # database.numerical_fields

  create_profile_wizard_options = {

    # How the reports should be grouped in the report menu
    report_groups = {
      date_time_group = ""
      hit_type = ""
      content_group = {
        cs_uri_stem = true
        file_type = true
        x_namespace = true
        cs_host = true
        x_server = true
      }
      client_group = {
        user = true
        x_remote_user = true
      }
      visitor_demographics_group = {
        hostname = true
        domain_description = true
        location = true
        organization = true
        isp = true
        domain = true
        authenticated_user = true
      }
      visitor_systems_group = {
        screen_dimensions = true
        screen_depth = true
        web_browser = true
        operating_system = true
      }
      referrer_group = {
        cs_referer = true
        search_engine = true
        search_phrase = true
        # search_phrase_by_search_engine = true
      }
      other_group = {
        s_port = true
        sc_status = true
        worm = true
        spider = true
        server_domain = true
        server_response = true
        pragma_in = true
        pragma_out = true
        cache_control_in = true
        cache_control_out = true
        vary_out = true
        x_cache_hit = true
        protocol = true
        operation = true
        cs_method = true
        cs_protocol = true
        x_revalidate_cache = true
        sc_substatus = true
      }
    } # report_groups

    snapons = {

      # Attach a page_views field
      page_views = {
        snapon = "page_views"
        name = "page_views"
        label = "$lang_admin.snapons.page_views.label"
        prompt_to_attach = true
        prompt_to_attach_default = true
        parameters = {
          file_type_field.parameter_value = "file_type"
          server_response_field.parameter_value = "sc_status"
          page_field.parameter_value = "cs_uri_stem"
          page_views_field_name.final_node_name = "page_views"
          page_views_field_name.parameter_value = "{=capitalize(lang_stats.field_labels.page_views)=}"
        } # parameters
        requires_log_fields = {
          cs_uri_stem = true
          sc_status = true
        }
      } # page_views

      # Add the standard reports
      add_standard_reports = {
        name = "add_standard_reports"
        label = "add_standard_reports"
        snapon = "add_standard_reports"
      } # add_standard_reports

    } # snapons

  } # create_profile_wizard_options

} # media_flow_controller_w3c