# Copyright (c) 2012 Flowerfire, Inc. All Rights Reserved.
media_flow_controller_w3c = {
# Plugin identity. plugin_version is this plugin's own version (it matches the
# newest entry in the change history below); the info.1.* keys name the
# manufacturer, the device/log type, and the device version this plugin targets.
plugin_version = "1.2.4"
info.1.manufacturer = "Juniper"
info.1.device = "Media Flow Controller (Access Logs) (W3C)"
info.1.version.1 = "mfc-11.B.3"
# 2012-07-06 - 1.0 - GMF - Initial creation
# 2012-07-17 - 1.0.1 - GMF - Fixed bug in naming of page_views instance (no spaces allowed); capitalized page_views field.
# 2012-07-23 - 1.1 - GMF - Added support for Countries/Regions/Cities based on cs-host field (and c-ip as before). Fixed support for cs_user_agent derived fields, and cs_referrer-derived fields.
# 2012-08-07 - 1.2 - GMF - Protected suppress_cs_range and suppress_etag filters in case the fields don't exist; added support for quoted fields in # Format line; categorized reports; other.
# 2012-08-20 - 1.2.1 - GMF - Added conversion from cs_host to c_ip; added new parameters to page_views
# 2012-08-10 - 1.2.2 - GMF - Changed name to have "Access Logs" in parentheses
# 2012-09-20 - 1.2.3 - GMF - At Juniper's request, removed sc_content_length, and added sc_bytes and sc_bytes
# 2012-10-17 - 1.2.4 - GMF - changed time_taken to seconds
# The name of the log format
log.format.format_label = "Juniper Media Flow Controller Access (W3C) Log Format"
# Classification of the data in these logs: HTTP access data from a web-server-type device.
log.miscellaneous.log_data_type = "http_access"
log.miscellaneous.log_format_type = "web_server"
# Description contributed by Juniper.
# The text below is a runtime string; it is stored under create_profile_wizard_options,
# so it is presumably shown to the user while creating a profile (confirm against Sawmill docs).
create_profile_wizard_options.plugin_description = `
This plugin is designed to detect and analyze Media Flow Controller Access Logs that are recorded in the W3C Extended Log Format, first supported in MFC version 11.B.3. The W3C header record documents the log fields and their order in the log records. The information in the header record allows the W3C plug-in to parse and analyze any log file that conforms to the W3C standard regardless of the optional log fields and their order in the log records.
The W3C Log Format provides significant advantage over the NCSA log format supported by earlier versions of Media Flow Controller. Users can now choose to log any number of log fields in any order using a custom log profile. Sawmill can interpret these custom formats and generate reports for each individual log field. The Juniper Media Flow Controller NCSA plug-in can analyze 10 fields defined by the NCSA Common Log Format (CLF). The W3C plug-in can analyze 20 additional fields and the fields can be in any order as defined by the user using the log profile feature. The reports generated by this plug-in include:
- Date and Time based reports such as daily, weekly, monthly & yearly reports of user traffic
- Content analysis reports based on content popularity, server domain(s), server response status and content referrers
- User demographic reports based on user's Country/State/City/Region, and ISP
- Many other custom reports like Cacheable vs Non Cacheable Objects, Cache Control headers etc.
`
# The log is in this format if any of the first ten lines match this regular expression.
# The expression looks for the "# Software: Media Flow Controller " header line at the start of a line.
log.format.autodetect_regular_expression = "^# Software: Media Flow Controller "
# This handles "#Fields:" / "# Format:" header lines, and creates log and database fields from them.
# While walking the header it also builds log.format.parsing_regular_expression so that it matches
# the fields in the order the header declares them. Every other line starting with "#" is rejected
# so that it is not parsed as a data line.
log.filter_preprocessor = `
# A header line looks like "#Fields: a b c" or "# Format: a b c"; the second capture is the space-separated field list.
if (matches_regular_expression(current_log_line(), '^#(Fields| Format): +([^ ].*)$')) then (
string fields = $2;
string fieldname;
# NOTE(review): v.logfieldindex is set here but is not read anywhere else in this preprocessor.
v.logfieldindex = 1;
# Path of this profile's numerical fields node; used below to skip normal database field creation for those names.
string numerical_fields = "profiles." . internal.profile_name . ".database.numerical_fields";
# This subroutine creates a database field under profiles.<profile>.database.fields.<fieldname>.
# top / bottom are written to suppress_top / suppress_bottom only when they are nonzero.
subroutine(create_database_field(string fieldname, int top, int bottom), (
# echo("create_database_field: " . fieldname); # debug
debug_message("create_database_field(" . fieldname . ")\n");
string databasefieldpath = "profiles." . internal.profile_name . ".database.fields." . fieldname;
# Assign through the computed path (presumably this creates the node if it does not exist yet), then take a node reference to it.
(databasefieldpath . "") = "";
node databasefield = databasefieldpath;
if (top ne 0) then
@databasefield{"suppress_top"} = top;
if (bottom ne 0) then
@databasefield{"suppress_bottom"} = bottom;
# sc_age is an integer field, non-aggregating but with many values. Leave it unnormalized and unindexed.
if (fieldname eq "sc_age") then (
@databasefield{"type"} = "int";
@databasefield{"aggregation_method"} = "none";
@databasefield{"index"} = false;
);
# Final expression is the field node; no caller in this script uses it.
databasefield;
));
# This subroutine creates a log field under profiles.<profile>.log.fields.<fieldname>.
# The "type" subnode is set only when a non-empty type is passed.
subroutine(create_log_field(string fieldname, string type), (
debug_message("create_log_field(" . fieldname . "; type=" . type . ")\n");
string logfieldpath = "profiles." . internal.profile_name . ".log.fields." . fieldname;
(logfieldpath . "") = "";
node logfield = logfieldpath;
if (type ne '') then
set_subnode_value(logfield, "type", type);
# Final expression is the field node; no caller in this script uses it.
logfield;
));
# Start the parsing expression at beginning-of-line; one capture group per log field is appended below.
log.format.parsing_regular_expression = '^';
# Derived log fields (file_type, search_engine, web_browser, ...) are queued here and created
# only after all header fields have been processed (see the foreach after the while loop).
node log_fields_at_end = new_node();
subroutine(create_log_field_at_end(node log_fields_at_end, string fieldname, string type), (
log_fields_at_end{fieldname}{"type"} = type;
));
# Extract the fields one at a time
# The first pattern peels off the leading field and keeps the remainder; the second handles the final field.
while (matches_regular_expression(fields, '^([^ ]+) (.*)$') or matches_regular_expression(fields, '^([^ ]+)$')) (
string unconverted_fieldname = $1;
fields = $2;
# Strip quotes from field name, if any
if (matches_regular_expression(unconverted_fieldname, '^"(.*)"$')) then
unconverted_fieldname = $1;
# Clean up the field name: lowercase it and replace every character outside a-z / 0-9 with an underscore
fieldname = '';
for (int i = 0; i < length(unconverted_fieldname); i++) (
string c = lowercase(substr(unconverted_fieldname, i, 1));
if (!matches_regular_expression(c, '^[a-z0-9]$')) then
c = '_';
fieldname .= c;
);
# Strip any trailing underscores left by the cleanup
while (matches_regular_expression(fieldname, '^(.*)_$'))
fieldname = $1;
# cs_uri is stored under the name cs_uri_stem
if (fieldname eq 'cs_uri') then (
fieldname = 'cs_uri_stem';
);
# 2012-08-20 - GMF - In earlier versions of this log format, the client IP field is incorrectly called cs_host. This compensates for that by assuming that whenever cs_host is seen, it really means c_ip.
if (fieldname eq 'cs_host') then
fieldname = 'c_ip';
#echo("fieldname: " . fieldname);
# Get the log field type
string log_field_type = '';
if (fieldname eq 'cs_referer') then (
log_field_type = 'url';
);
else if (fieldname eq 'cs_uri_stem') then (
log_field_type = 'page';
);
else if (fieldname eq 'cs_user_agent') then (
log_field_type = 'agent';
);
# NOTE(review): the cs_host comparison below cannot be true, because cs_host was renamed to c_ip above.
else if ((fieldname eq 'c_ip') or (fieldname eq "cs_host")) then (
log_field_type = 'host';
);
# Create the log field (but not cs_request, which is really three log fields, created below).
# cs_request is parsed as three space-separated tokens: method, URI, protocol.
if (fieldname eq "cs_request") then (
create_log_field('cs_method', '');
create_log_field('cs_uri_stem', 'page');
create_log_field('cs_protocol', '');
create_log_field_at_end(log_fields_at_end, 'file_type', 'file');
create_log_field_at_end(log_fields_at_end, 'screen_dimensions', '');
create_log_field_at_end(log_fields_at_end, 'screen_depth', '');
log.format.parsing_regular_expression .= '([^ ]+) ([^ ]+) ([^ ]+)';
);
else if (fieldname eq 'cs_referer') then (
create_log_field(fieldname, log_field_type);
create_log_field_at_end(log_fields_at_end, 'search_engine', '');
create_log_field_at_end(log_fields_at_end, 'search_phrase', '');
log.format.parsing_regular_expression .= '([^ ]+)';
);
# The user agent capture accepts any characters, including spaces, unlike the other fields.
else if (fieldname eq 'cs_user_agent') then (
create_log_field(fieldname, log_field_type);
create_log_field_at_end(log_fields_at_end, 'operating_system', '');
create_log_field_at_end(log_fields_at_end, 'web_browser', '');
create_log_field_at_end(log_fields_at_end, 'spider', '');
log.format.parsing_regular_expression .= '(.*)';
);
# time is a bracketed value; it is split into date (text before the first colon) and time (up to the next space).
# The rest of the bracketed text is matched but not captured.
else if (fieldname eq "time") then (
create_log_field('date', '');
create_log_field('time', '');
log.format.parsing_regular_expression .= '[[]([^:]+):([^ ]+) [^]]+[]]';
);
# Any other field is a single space-free token
else (
create_log_field(fieldname, log_field_type);
log.format.parsing_regular_expression .= '([^ ]+)';
);
# If there's another field after this one, add a space to the regular expression
if (length(fields) > 0) then
log.format.parsing_regular_expression .= ' ';
# If we're creating a profile, create the database fields too.
if (node_exists("volatile.creating_profile")) then (
# Handle time by creating date_time and derived database fields
if (fieldname eq "time") then (
create_database_field('date_time', 0, 0);
create_database_field('day_of_week', 0, 0);
create_database_field('hour_of_day', 0, 0);
); # if time
# Create derived fields for agent
else if (fieldname eq "cs_user_agent") then (
create_database_field('operating_system', 0, 0);
create_database_field('web_browser', 0, 0);
create_database_field('spider', 0, 0);
);
# Create database field c_ip and derived field for client IP
# NOTE(review): the cs_host comparison below cannot be true, because cs_host was renamed to c_ip above.
else if ((fieldname eq "c_ip") or (fieldname eq "cs_host")) then (
create_database_field(fieldname, 0, 0);
create_database_field('location', 0, 0);
);
# Create database field cs_referer and derived fields for referrer
else if (fieldname eq "cs_referer") then (
create_database_field('cs_referer', 1, 9);
create_database_field('search_engine', 0, 0);
create_database_field('search_phrase', 0, 0);
);
# Create the three fields split out of cs_request, plus the derived file type and screen fields
else if (fieldname eq "cs_request") then (
create_database_field('cs_method', 0, 0);
create_database_field('cs_uri_stem', 0, 9);
create_database_field('file_type', 0, 0);
create_database_field('screen_dimensions', 0, 0);
create_database_field('screen_depth', 0, 0);
create_database_field('cs_protocol', 0, 0);
);
# Don't create a database field for x_origin_fetch_size (is reporting required on this field?)
else if (fieldname eq "x_origin_fetch_size") then (
);
# Don't add a database field for numerical fields
else if (subnode_exists(numerical_fields, fieldname)) then (
debug_message("Not adding numerical field: " . fieldname . "\n");
);
# Create a normal database field
else
create_database_field(fieldname, 0, 0);
); # if creating profile
); # while another field
# Create any final log fields (the derived fields queued by create_log_field_at_end)
node lfae;
foreach lfae log_fields_at_end (
#echo("Final log field creation: " . node_name(lfae));
create_log_field(node_name(lfae), @lfae{"type"});
);
# Don't parse the #Fields line as a data line
'reject';
); # if # Format
# Don't parse any other # lines as data lines
else if (starts_with(current_log_line(), '#')) then (
'reject';
);
`
# Log fields
# Intentionally empty: log.filter_preprocessor creates the log fields at run time
# from the field list in the log's header line.
log.fields = {
} # log.fields
# Database fields
# Intentionally empty: log.filter_preprocessor creates the database fields
# when a profile is being created.
database.fields = {
} # database.fields
# Get web browser, operating system, and spider information from the user-agent field.
# get_user_agent_info() is called on cs_user_agent, and the three derived log fields are
# then copied from the volatile.* nodes (presumably populated by that call).
# The filter is only used when the cs_user_agent field exists (requires_fields).
log.parsing_filters.derive_from_user_agent = {
value = `
get_user_agent_info(cs_user_agent);
web_browser = volatile.web_browser;
operating_system = volatile.operating_system;
spider = volatile.spider;
`
requires_fields = {
cs_user_agent = true
}
} # derive_from_user_agent
# Log Filters
# The filters are listed in their original order. Filters that touch a specific
# log field are guarded with requires_fields so they are skipped when the log's
# header line does not declare that field.
log.filters = {
# Replace everything after the "?" in the page field with "(parameters)".
# The page-typed log field in this plugin is cs_uri_stem (the preprocessor never
# creates a field called "page"), so the filter operates on cs_uri_stem.
remove_query = {
label = "$lang_admin.log_filters.remove_query_label"
comment = "$lang_admin.log_filters.remove_query_comment"
value = "if (contains(cs_uri_stem, '?')) then cs_uri_stem = substr(cs_uri_stem, 0, index(cs_uri_stem, '?') + 1) . '(parameters)';"
requires_fields = {
cs_uri_stem = true
}
} # remove_query
# Overwrite cs_range with a fixed placeholder value.
suppress_cs_range = {
label = "$lang_admin.log_filters.suppress_field_label"
comment = "$lang_admin.log_filters.suppress_field_comment"
value = "cs_range = '[suppressed]'"
requires_fields = {
cs_range = true
}
} # suppress_cs_range
# Overwrite sc_etag with a fixed placeholder value.
suppress_etag = {
label = "$lang_admin.log_filters.suppress_field_label"
comment = "$lang_admin.log_filters.suppress_field_comment"
value = "sc_etag = '[suppressed]'"
requires_fields = {
sc_etag = true
}
} # suppress_etag
# Count every accepted entry as one access (feeds the accesses numerical field).
mark_entry = {
label = '$lang_admin.log_filters.mark_entry_label'
comment = '$lang_admin.log_filters.mark_entry_comment'
value = 'accesses = 1;'
} # mark_entry
} # log.filters
# Fields used for session (visitor path) analysis.
# The visitor id is c_ip: log.filter_preprocessor renames a cs_host header field
# to c_ip, so a log field named cs_host never exists in a profile.
log.field_options = {
sessions_page_field = "cs_uri_stem"
sessions_visitor_id_field = "c_ip"
sessions_event_field = "page_views"
} # log.field_options
# Numerical (aggregating) database fields. log.filter_preprocessor checks this
# node and does not create a normal database field for any name listed here.
database.numerical_fields = {
# Entry counter; set to 1 per entry by the mark_entry log filter, so it needs no log field.
accesses = {
default = true
requires_log_field = false
entries_field = true
} # accesses
# Count of distinct client IPs. The source log field is c_ip: the preprocessor
# renames a cs_host header field to c_ip, so no cs_host log field is ever created.
unique_client_ips = {
log_field = "c_ip"
type = "unique"
} # unique_client_ips
# Byte counters, displayed as bandwidth.
sc_bytes = {
type = "int"
integer_bits = 64
display_format_type = "bandwidth"
} # sc_bytes
cs_bytes = {
type = "int"
integer_bits = 64
display_format_type = "bandwidth"
} # cs_bytes
sc_bytes_content = {
type = "int"
integer_bits = 64
display_format_type = "bandwidth"
} # sc_bytes_content
# Removed per Juniper suggestion.
# sc_content_length = {
# type = "int"
# integer_bits = 64
# display_format_type = "bandwidth"
# } # sc_content_length
# Durations: time_taken is in seconds (see change history, 1.2.4); x_time_used_ms is displayed as milliseconds.
time_taken = {
type = "int"
integer_bits = 64
display_format_type = duration_compact
} # time_taken
x_time_used_ms = {
type = "int"
integer_bits = 64
display_format_type = duration_milliseconds
} # x_time_used_ms
} # database.numerical_fields
# Options for profile creation: how reports are grouped in the report menu,
# and which snapons are attached to a new profile.
create_profile_wizard_options = {
# How the reports should be grouped in the report menu
# Each group lists the field names whose reports belong to it. Names that the
# log's header line does not produce are presumably ignored (confirm against Sawmill docs).
report_groups = {
# These two groups are declared with an empty value rather than a field list.
date_time_group = ""
hit_type = ""
content_group = {
cs_uri_stem = true
file_type = true
x_namespace = true
# NOTE(review): log.filter_preprocessor renames cs_host to c_ip, so no field named
# cs_host is created; c_ip itself is not listed in any group here.
cs_host = true
x_server = true
}
client_group = {
user = true
x_remote_user = true
}
visitor_demographics_group = {
hostname = true
domain_description = true
location = true
organization = true
isp = true
domain = true
authenticated_user = true
}
visitor_systems_group = {
screen_dimensions = true
screen_depth = true
web_browser = true
operating_system = true
}
referrer_group = {
cs_referer = true
search_engine = true
search_phrase = true
# search_phrase_by_search_engine = true
}
other_group = {
s_port = true
sc_status = true
worm = true
spider = true
server_domain = true
server_response = true
pragma_in = true
pragma_out = true
cache_control_in = true
cache_control_out = true
vary_out = true
x_cache_hit = true
protocol = true
operation = true
cs_method = true
cs_protocol = true
x_revalidate_cache = true
sc_substatus = true
}
} # report_groups
snapons = {
# Attach a page_views field
# The snapon is parameterized with this plugin's field names (file_type, sc_status,
# cs_uri_stem) and is only offered when cs_uri_stem and sc_status exist as log fields.
page_views = {
snapon = "page_views"
name = "page_views"
label = "$lang_admin.snapons.page_views.label"
prompt_to_attach = true
prompt_to_attach_default = true
parameters = {
file_type_field.parameter_value = "file_type"
server_response_field.parameter_value = "sc_status"
page_field.parameter_value = "cs_uri_stem"
# The node name must be page_views with no spaces (see change history, 1.0.1); the value is the capitalized label.
page_views_field_name.final_node_name = "page_views"
page_views_field_name.parameter_value = "{=capitalize(lang_stats.field_labels.page_views)=}"
} # parameters
requires_log_fields = {
cs_uri_stem = true
sc_status = true
}
} # page_views
# Add the standard reports
add_standard_reports = {
name = "add_standard_reports"
label = "add_standard_reports"
snapon = "add_standard_reports"
} # add_standard_reports
} # snapons
} # create_profile_wizard_options
} # media_flow_controller_w3c