db = { parenthesizedomitted = { label = "Parenthesized Items Omitted" question = "There's a line above some of the tables in the statistics that says, \"parenthesized items omitted.\" What does that mean?" short_answer = "It means that some items (probably useless ones) have been omitted from the table to make the information more useful--you can show them by choosing \"show parenthesized items\" from the Options menu." long_answer = "
$PRODUCT_NAME omits parenthesized items (i.e. any item that starts with \"(\" and ends with \")\") from some tables to make the information more useful. For instance, most hits on a web site do not come directly from a search engine (some come from links on other pages of the site, and others come from links on web sites that are not search engines), so usually the largest item in the search engines table would be the item called \"(no search engine).\" Because hits from non-search-engines are not important in the search engines table, and because they dominate the numbers, making it difficult to compare \"real\" search engines, this item is omitted from the table by default. The way $PRODUCT_NAME omits it is by omitting all parenthesized items. Other examples of parenthesized items include the \"(no search terms)\" item in the search terms table, and the \"(internal referrer)\" item in the referrers table.
If you want to see all the hits in these tables, you can turn on parenthesized items in the Table Options page.
" } no_referrer_reports = { label = "Referrer Reports Missing" question = "My log data contains referrer information, but I don't see referrer reports, or search engines, or search phrases. Why not?" short_answer = "$PRODUCT_NAME includes referrer reports if the beginning of the log data includes referrers. If your log data starts without referrers, and adds it later, you won't see referrer reports. Create a new profile from the latest log file (with referrers), and change the log source to include all log data." long_answer = "When a profile is created, $PRODUCT_NAME looks at the first few lines of the log data when determining which fields are present, and which reports to generate. If it sees a referrer field there, it will create a Referrer report, and Search Engines and Search Phrases reports, and other referrer-related reports.
This can be a problem if the log data does not contain referrer data at the beginning of the dataset. For instance, IIS often defaults to minimal logging (without referrers), and Apache often defaults to logging in Common Log Format (without referrers). If you later reconfigure the server to log referrers, $PRODUCT_NAME still won't know that, because the beginning of the log data does not contain referrers, and that's where it looks. So a profile created from the whole dataset will not report referrers, even though the later data contains referrer information.
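For reference, on Apache, referrers are included when the server logs in the \"combined\" format rather than the \"common\" format; a directive along these lines in the Apache configuration (the log path here is illustrative) enables it:

CustomLog logs/access_log combined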
The solution is to recreate the profile, and when it asks you where the log data is, point it to the most recent file. That file will certainly have referrer information at the beginning, so the referrer reports will be set up properly. After creating the profile, and before viewing reports or rebuilding the database, go to the Config for the profile and change the Log Source to include all your log data. Then view reports, and referrer reports will be included.
" } internalreferrers = { label = "Eliminating Internal Referrers" question = "Most of the referrers listed in the \"Top referrers\" view are from my own site. Why is that, and how can I eliminate referrers from my own site from the statistics?" short_answer = "These are \"internal referrers\"; they represent visitors going from one page of your site to another page of your site. You can eliminate them by modifying the default \"(internal referrer)\" log filter, changing http://www.mydomain.com/ in that filter to your web site URL." long_answer = "Referrers show which page a hit came from -- i.e. they show what page a visitor was on when they clicked the link that took them to your page. For most web sites, visitors arrive and then click through several pages before leaving, so most web log data has a lot of referrers that are pages on the site being analyzed. For instance, if someone visits http://www.yoursite.com/index.html, and then clicks on a link pointing to http://www.yoursite.com/page2.html, it will show up in the log data (and in the statistics) as a referrer http://www.yoursite.com/index.html. These referrers are called an \"internal referrer,\" and under normal circumstances, you don't really care about them-- what you really want to know is which referrers brought traffic to your site, not what the referrers were once they got there.
$PRODUCT_NAME can't distinguish internal referrers from external referrers because it doesn't know your site's URL. So it doesn't know if a referral from http://www.yoursite.com/index.html is internal (which it is if your site is yoursite.com), or external (which it is if your site is anything else). To help $PRODUCT_NAME identify and hide internal referrers, you need to modify a log filter that $PRODUCT_NAME creates for you. Here's how:
Go to the Config section of your profile.
Click Log Filters.
Edit the log filter which sets referrers from \"yoursite.com\" to \"(internal referrer)\".
Replace \"yoursite.com\" with your actual site name, in that log filter.
Rebuild the database.
Once you've done that, the internal referrers will be suppressed in the \"Top referrers\" view (or they will appear as \"(internal referrer)\" if you've turned on parenthesized items).
" } trialdifference = { label = "Difference Between Trial and Full" question = "What's the difference between the full version of $PRODUCT_NAME and the Trial version?" short_answer = "The Trial version is identical to the full version, except that it expires after 30 days." long_answer = "$PRODUCT_NAME Trial is a free trial version, intended to let you evaluate the program without having to buy it. It is identical to the full version, except that it expires 30 days after it is first used. After the trial period is over, the trial version will no longer work, but it can be unlocked by purchasing a license, and all settings, profiles, and databases will remain intact." } tierdifferences = { label = "Difference Between Enterprise and Professional" question = "What's the difference between $PRODUCT_NAME Enterprise and $PRODUCT_NAME Professional?" short_answer = "Enterprise supports MySQL, RBAC, multithreaded database builds, real-time reporting, and full interface customization." long_answer = "
$PRODUCT_NAME Enterprise is intended for large organizations with very large datasets and advanced customization needs.
$PRODUCT_NAME Enterprise has all the features of $PRODUCT_NAME Professional, and the following additional features.
MySQL Server Support. Support for MySQL as a back-end database. This allows the data collected by $PRODUCT_NAME to be queried externally, and provides much greater scalability through the use of multi-computer database clusters.
Role-based Authentication (RBAC). The Enterprise version has the capability of using RBAC to the fullest extent, with full customization of the roles assigned to users; multiple roles can be assigned in any configuration.
Oracle Database Support. Only the Enterprise version has support for Oracle, and can be configured to gather information directly from the database.
Real-time Reporting. Enterprise has the capability to process data as needed, in real-time, to give you up-to-the-minute reports.
Interface customization. The web interface for $PRODUCT_NAME is written entirely in its internal language (somewhat similar to perl). With Enterprise licensing, these files can be edited, providing complete customization of the entire user interface, both administrative and non-administrative.
Log files are large, ugly text files generated by web servers, proxy servers, FTP servers, and just about every other kind of server. Every time something happens on the server (it serves a file, or delivers a message, or someone logs in, or something else), the server logs that information to the file, which continues to grow as new events occur. Log files are not particularly human-readable, and do not generally contain summarizing information, which is why $PRODUCT_NAME exists -- $PRODUCT_NAME processes your log files, summarizes and analyzes them in many ways, and reports the results back to you in a much friendlier format -- graphs, tables, etc.
You need to have access to your log files to use $(PRODUCT_NAME). If you don't have log files, $PRODUCT_NAME can't do anything for you. If you don't know where your log files are, ask your server administrator (hint: they are often stored in a directory called \"logs\"). In some cases, servers are configured so they do not keep log files, or the logs are hidden from users; in these situations, you will not be able to use $(PRODUCT_NAME). Again, your server administrator can help you find your log files, or they can tell you why they're not available. If you're trying to analyze a web site, and your ISP does not provide logs for you, you may want to consider switching to one that does.
" } platforms = { label = "Available Platforms" question = "What platforms does $PRODUCT_NAME run on?" short_answer = "Microsoft Windows 7/8/Vista/XP/2003/2008/2012, Mac OS X, most versions and variants of UNIX." long_answer = "$PRODUCT_NAME runs on Microsoft Windows Server 2003, Windows Server 2008, Windows 2012, Windows XP, Windows Vista, Windows 7, Windows 8, Mac OS X and most popular flavors of UNIX (Linux, Solaris, FreeBSD, OpenBSD, NetBSD, BSD/OS, Tru64 UNIX (Digital Unix), IRIX, HP/UX, AIX, OS/2, and BeOS). It is expected to also remain compatible with future versions of Windows and Mac OS X. Binary versions are available for the most popular platforms; on less common platforms, it may be necessary to build $PRODUCT_NAME yourself from the source code (which is available for download in encrypted/obfuscated format).That's just the server; once you have the server running, you can configure $PRODUCT_NAME, generate reports, and browse reports from any computer, using a normal web browser.
" } systemrequirements = { label = "System Requirements" question = "How much memory, CPU power, and disk space do I need to run $PRODUCT_NAME?" short_answer = "At least 2GB RAM, 4 GB preferred; 500 MB disk space for an average database; and as much CPU power as you can get." long_answer = "$PRODUCT_NAME is a heavy-duty number crunching program, and can use large amounts of memory, CPU, and disk. You have some control over how much it uses of each, but it still requires a reasonably powerful computer to operate properly.
$PRODUCT_NAME uses around 100 MB of memory when it processes a small to medium size log file, and it can use considerably more for very large log files. The main memory usage factors are the \"item lists\", which are tables containing all the values for a particular field. If a field in your data is very complex, with many unique values (the URL query field of web log data is a common example), the item list can be very large, requiring hundreds of megabytes of memory. This memory is mapped to disk to minimize physical RAM usage, but it still contributes to $PRODUCT_NAME's total virtual memory usage. So for databases with very complex fields, large amounts of RAM will be required. For large datasets, it is possible for $PRODUCT_NAME to use more than 2 GB of address space, exceeding the capabilities of a 32-bit system; in this situation, it is necessary to use a 64-bit system, or a MySQL database, or both (see {=docs_faq_link('dbmemory')=} and {=docs_faq_link('memoryusage')=}). This typically will not occur with a dataset smaller than 10 GB, and it is often possible to process a much larger dataset on a 32-bit system with 2 GB of RAM. A dataset over 20 GB will often run into this issue, however, so a 64-bit system is recommended for very large datasets (10 GB of log data or more). A multi-core 64-bit CPU coupled with a 64-bit operating system and at least 2 GB of RAM per core (e.g. 8 GB for a 4-core system) is highly recommended, if not required, for datasets larger than 10 GB of log data. If your system cannot support the RAM usage required by your dataset, you may need to use log filters to simplify the complex database fields.
The $PRODUCT_NAME installation itself takes less than 50 MB of disk space, but the database it creates can take much more. A small database may be only a couple of megabytes, but if you process a large amount of log data, or turn on a lot of cross-references and ask for a lot of detail, there's no limit to how large the database can get. In general, the database will be somewhere on the order of 200% to 300% of the size of the uncompressed log data in it, perhaps as much as 400% in some cases. So if you're processing 100 GB of log data, you should have 200 GB to 400 GB of disk space free on your reporting system to hold the database. If you use an external (e.g. SQL) database, the database information will take very little space on the reporting system, but will take a comparable amount of space on the database server.
Disk speed is something else to consider when designing a system to run $(PRODUCT_NAME). During log processing, $PRODUCT_NAME makes frequent use of the disk, and during statistics viewing it uses it even more. Many large memory buffers are mapped to disk, so disk speed can have a very large impact on database performance, both for processing log data and for querying the database. A fast disk will improve $PRODUCT_NAME's log processing speed and the responsiveness of the statistics. SCSI is better than IDE, and SCSI RAID is best of all.
During log processing, especially while building cross-reference tables, the CPU is usually the bottleneck -- $PRODUCT_NAME's number crunching takes more time than any other aspect of log processing, so the rest of the system ends up waiting on the CPU most of the time. This means that any improvement in CPU speed will result in a direct improvement in log processing speed. $PRODUCT_NAME can run on any system, but the more CPU power you can give it, the better. Large CPU caches also significantly boost $PRODUCT_NAME's performance, by a factor of 2x or 3x in some cases.
" } dbmemory = { label = "Database Memory Usage" question = "I get an error 'Unable to allocate N bytes of memory' while building a database, and $PRODUCT_NAME seems to have used all my available memory. What can I do about it?" short_answer = "Use a 64-bit computer and operating system with sufficient RAM, and/or simplify your database" long_answer = `This error means that $PRODUCT_NAME tried to allocate another chunk of memory (N additional bytes, on top of whatever it was already using), and the operating system told it that there was no more memory available for it to use. This error is usually not a bug; it almost always indicated that $PRODUCT_NAME really has exhausted all memory available. This error typically happens when using the "internal" database with a very large dataset.
The "internal" database is optimized for performance above all, and tends to keep some key data structures in memory. On 32-bit systems, when processing large datasets, the amount of memory required may exceed the available address space. Typically, the internal database will work well up to about 10 GB of uncompressed log data on a 32-bit system. Above that, scalability may become an issue. On 64-bit systems, the address space is not a concern, but if there is not sufficient physical RAM, this error can still occur.
Itemnum tables, especially, can result in heavy memory usage for large datasets. Itemnum tables, or normalization tables, are typically kept in memory. $PRODUCT_NAME keeps a list of all values seen for each field, e.g., a list of all IP addresses which appear in a particular field, or a list of all URLs which appear in another field, in the "itemnum" tables. These tables are kept in memory, or at least mapped to memory, so they use available memory addressing space. In the case of an IP address field, for instance the source IP address of a web server log, each value is about ten bytes long. If there are 10 million unique IPs accessing the site, this table is 100 million bytes long, or 100 MB. Similarly for a proxy log analysis, if each unique URL is 100 bytes long and there are 10 million unique URLs in the log data, the table will be 1 GB. Tables this large can easily exceed the capabilities of a 32-bit system, which typically allows only 2 GB of memory to be used per process.
One solution is to use a 64-bit system and operating system, with sufficient RAM; with a 64-bit processor, $PRODUCT_NAME will be able to allocate as much RAM as it needs, provided the RAM is available on the system (and it can use virtual memory if it isn't). This is the most complete solution; with a large amount of RAM on a 64-bit system, it should be possible to build extraordinarily huge databases without running out of memory.
Another option is to simplify the dataset; see {=docs_chapter_link('resources')=} for suggestions. In particular, adding a log filter to simplify or eliminate very complex database fields can not only reduce memory usage, but also improve performance.
For an estimate of how much RAM you may need, see {=docs_chapter_link('server_sizing')=}.
` } winsock2 = { label = "Winsock 2" question = "When I run $PRODUCT_NAME on Windows, I get an error: \"A required DLL is missing: WS2_32.DLL.\" What's going on?" short_answer = "You need Winsock 2." long_answer = "To run on Windows 95, and some early versions of Windows 98, $PRODUCT_NAME requires Winsock2, a networking component available for free from Microsoft. You can download Winsock2 from here.Winsock2 is already part of Windows 98 (newer versions), Windows NT 4.0, and Windows 2000, so you do not need to download this component unless you are using Windows 95 or an old version of Windows 98.
" } libstdcppmissing = { label = "libstdc++ missing" question = "When I run $PRODUCT_NAME, I get an error: './sawmill: error while loading shared libraries: libstdc++.so.5: cannot open shared object file: No such file or directory'. What's going on?" short_answer = "$PRODUCT_NAME requires the libstdc++ library. This is available by default on many platforms, and is included in the $PRODUCT_NAME distribution on others (including Solaris)" long_answer = "$PRODUCT_NAME requires the libstdc++ library. This is available by default on many platforms, but it is not available on some older platforms, and it is often not available on Solaris. There are several ways of making this available:
Install the g++ compiler. This is available for all platforms from GNU. g++ is also available as a package (e.g. a Red Hat RPM) for most platforms, and is available as an installation option on most platforms. libstdc++ is part of the g++ compiler, so installing it will install libstdc++.
Use the libstdc++ included with $PRODUCT_NAME. On Solaris, the standard download of $PRODUCT_NAME includes the libstdc++ file (whose name starts with libstdc++). If you have root access, the easiest way to install this is to copy it to /usr/lib. If you don't, you can set the environment variable LD_LIBRARY_PATH to point to your $PRODUCT_NAME installation. For instance, if your $PRODUCT_NAME installation is at /usr/sawmill, you can run this:
setenv LD_LIBRARY_PATH \"\\$LD_LIBRARY_PATH:/usr/sawmill\"
export LD_LIBRARY_PATH=\"\\$LD_LIBRARY_PATH:/usr/sawmill\"
to add /usr/sawmill to the end of the LD_LIBRARY_PATH variable. You'll only need one of these two commands (the first if you're using csh as your shell, and the second if you're using bash), but it won't hurt to run them both if you're not sure which to use; you'll just get a harmless error message from the wrong one.
These commands will last for one command-line session. If you need to make this change permanent, you can add the pathname on a separate line in the /etc/ld.conf file, or you can add the command above to one of your login scripts (e.g. .login, .cshrc, .bashrc).
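For example, assuming the bash shell and a $PRODUCT_NAME installation at /usr/sawmill (an illustrative path), adding this line to the end of your .bashrc makes the change permanent for future sessions:

export LD_LIBRARY_PATH=\"\\$LD_LIBRARY_PATH:/usr/sawmill\"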
After setting LD_LIBRARY_PATH, you should be able to run $PRODUCT_NAME.
Find an existing libstdc++ on your system. It is possible that you do have libstdc++ installed on your system, but it's not in your LD_LIBRARY_PATH. If that's the case, you can add the location of libstdc++ to the LD_LIBRARY_PATH using the instructions above. For instance, if it is in /usr/local/lib, you can add that to LD_LIBRARY_PATH to use it.
This DLL is part of Microsoft Internet Explorer. It is also included in many recent versions of Windows. If you see this error, download and install the latest Internet Explorer, and the problem should go away.
" } dnsproblems = { label = "Problems With DNS Lookup" question = "$PRODUCT_NAME only shows me the IP addresses of my visitors, even when I turn on DNS lookup. Why?" short_answer = "Try deleting the IPNumbersCache file in LogAnalysisInfo -- see the long answer for other solutions." long_answer = "(See {=docs_faq_link('dnslookup') =} for information about reverse DNS lookup).
Usually, this occurs because the DNS server can't resolve the IPs. The DNS server you're using needs to know about the IPs you're resolving. For instance, you can't use an external DNS server to resolve internal IP addresses, unless the external DNS server knows about them. Try using an internal DNS server, or another DNS server, if the first DNS server you try can't seem to resolve the IPs. It's useful to manually query the DNS server to see if it can resolve a particular IP; on most operating systems, this can be done with the \"nslookup\" command.
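For example, to check whether a particular DNS server can resolve a particular IP, you can run something like this (both the IP address and the DNS server address here are illustrative placeholders):

nslookup 192.0.2.17 10.0.0.53

If this returns no hostname, $PRODUCT_NAME will not be able to resolve that IP through that DNS server either.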
" } noimagescgi = { label = "No Images in CGI Mode" question = "I run $PRODUCT_NAME in CGI mode, and all the images in the menus and the reports are missing or broken. Why?" short_answer = "You may have set the \"temporary $lang_stats.directory\" incorrectly during installation. Try deleting the preferences.cfg file in LogAnalysisInfo, and access $PRODUCT_NAME to try again." long_answer = "When $PRODUCT_NAME runs as a CGI program, it includes images in its pages by creating them in a temporary $lang_stats.directory in the web server $lang_stats.directory, and then embedding links in the HTML so that the images it created are served by the web server. This is done by selecting a \"temporary $lang_stats.directory\" and \"temporary $lang_stats.directory URL\" which point to a $lang_stats.directory inside the web server's root $(lang_stats.directory). They both point at the same $lang_stats.directory, but one of them is the pathname of the $lang_stats.directory, and one of them is the URL of the $(lang_stats.directory). These two must point at the same $lang_stats.directory for images to appear in the pages generated by $PRODUCT_NAME in CGI mode. If images are not appearing, it is usually because this is set incorrectly.
To correct the temporary $lang_stats.directory, delete the preferences.cfg file in the LogAnalysisInfo folder, and access $(PRODUCT_NAME). You will be prompted to enter the pathname and URL of the temporary $(lang_stats.directory). Make sure you see the logo on the page after you enter the temporary $lang_stats.directory -- if the logo does not appear, click your browser's Back button and try again until you do. If the logo does not appear, no other images in the $PRODUCT_NAME interface will appear either.
" } cantaccessserver = { label = "Can't Access the Server" question = "When I run $PRODUCT_NAME, it tells me that the server is started (it shows me the URL), but when I try to access that URL, the browser says it's not available. How can I fix this?" short_answer = "You may be using a proxy server which prevents you from accessing a server running on your own machine. Try reconfiguring the proxy to allow it, or try running $PRODUCT_NAME on IP 127.0.0.1 (the loopback interface)." long_answer = "If you're running Windows 2003 and using Internet Explorer, look at {=docs_faq_link('w2003_ie_lockdown')=} first, and return here if that doesn't help.
When you first start $PRODUCT_NAME in web server mode, it tries to start a web server, running on the local machine, using port 8988. If this fails, it should give you an error message; if it succeeds, it should give you a URL. If you're seeing a URL when you start $PRODUCT_NAME, it generally means that the $PRODUCT_NAME server started successfully, and is ready to answer web browser requests.
Sometimes, though, when you actually try to access that URL, you may find that the server doesn't answer. Your browser may tell you that there's a DNS error, or that it couldn't contact the server, or that there's some other kind of error. If $PRODUCT_NAME displayed a URL, the server itself is probably working fine-- the problem is not with the server, but with the network connection to the server. This can happen, for instance, if you're using a web server proxy or cache server, and it doesn't know about the IP address of your own machine. When you contact the cache and ask to connect to your own machine, it gets confused, because normal web requests come from inside machines contacting outside machines, and this one is an inside machine contacting another inside machine (itself). A well-configured proxy server can handle this, but one that is not configured to handle internal requests may attempt to get the URL from the outside, and may give an error when it doesn't find it there. Some proxies/caches/firewalls will also refuse to let through traffic on port 8988 ($PRODUCT_NAME's default port), regardless of other settings.
There are several solutions. One choice is to reconfigure the proxy or cache server to allow HTTP connections from internal machines to other internal machines, on port 8988. Then $PRODUCT_NAME will be able to operate in its preferred mode, on port 8988 of the machine's first IP address.
If that's not an option, you may be able to get $PRODUCT_NAME to work by running it on the loopback interface (IP 127.0.0.1), or on port 80 (the standard web server port). The easiest way to find a working solution is to use the command-line interface to $PRODUCT_NAME, at least until you have it working; you can go back to using the graphical version later. From the command line, run $PRODUCT_NAME like this:
$PRODUCT_EXECUTABLE -ws t -sh 127.0.0.1 -wsp 80
This will attempt to start $PRODUCT_NAME's web server on IP 127.0.0.1 (the loopback interface), using port 80. This will only work if there is not a web server already running on the system-- only one server can use port 80 at a time. If you already have a web server running, use port 8988 instead. Try the command above with different IP addresses (127.0.0.1, and any IP addresses you know belong to your computer), and different ports (try 8988 first, then 80). With a little luck one of the choices will start a server that you can connect to. Once you've got the $PRODUCT_NAME interface working in your web browser, you can set it to use that IP and port permanently in the Preferences, from the Administrative Menu. Once you've set the IP and port in the Preferences, you can quit the command-line $PRODUCT_NAME, and start using the graphical version, if you prefer.
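For instance, assuming your machine's address is 192.168.1.5 (an illustrative value), the same command with that IP and the default port would be:

$PRODUCT_EXECUTABLE -ws t -sh 192.168.1.5 -wsp 8988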
If that still doesn't work, check if there is a firewall on your system or on your network, which is blocking traffic from your machine to itself, on port 8988. If there is, try disabling the firewall temporarily (or reconfigure it to allow the traffic), and see if it works then. If it works with the firewall disabled, and doesn't work with the firewall enabled, then the firewall is probably blocking the necessary traffic. You'll probably want to reconfigure the firewall to let the network traffic through on 8988.
If none of these work, and you have a web server running on your system there is always CGI mode. $PRODUCT_NAME can run under any running web server in CGI mode; if you can connect to the web server itself, you'll be able to use $PRODUCT_NAME by running $PRODUCT_NAME under your local server as a CGI program.
Finally, if you can't get $PRODUCT_NAME to work to your satisfaction, please contact $SUPPORT_EMAIL.
" } loginloop = { label = "Login Loops Back to Login" question = "When I try to log in to $PRODUCT_NAME, I get to the Admin page, but the next thing I click takes me back to the login page. Why?" short_answer = "Your browser isn't storing the cookie $PRODUCT_NAME needs to maintain the login, or something is blocking the browser from sending the cookie. Make sure cookies are on in the browser, firewalls aren't blocking cookies, and don't use Safari 1.2.1 or earlier as your browser." long_answer = `$PRODUCT_NAME uses web browser cookies to store your login information, which keeps you logged in. If the browser isn't passing the cookie back to $PRODUCT_NAME properly, $PRODUCT_NAME won't know you're logged in, and you'll keep getting the login screen.
To keep this from happening, make sure cookies are enabled in your web browser. If you want to be selective about who gets cookies, at least make sure that the hostname or IP where $PRODUCT_NAME is running is allowed to get cookies. If your browser differentiates "session cookies" from other cookies, all you need is session cookies.
Use an approved browser -- some browsers don't handle cookies quite right. Approved browsers are Internet Explorer 6, Safari 1.2.2 or later, and Firefox. Others may work, but have not been verified. In particular, Safari 1.2.1 and earlier do not handle cookies properly -- this is fixed in 1.2.2 and later.
` } commandlinelogsource = { label = "Using a Command-line Log Source" question = "Can $PRODUCT_NAME use scp, or sftp, or ssh, or https, to download log data? Can it uncompress tar, or arc, or sea, or hqx, etc.?" short_answer = "Not directly, but you can do it by using a command-line log source to run a command line, script, or program that does whatever is necessary to fetch the data, and prints it to $PRODUCT_NAME." long_answer = "$PRODUCT_NAME supports many different methods of acquiring log data, including direct access to local files, and FTP or HTTP access to remote files; it can also decompress the major compression formats on the fly, including zip, gzip, and bzip2. If you need to use a different method to fetch the log data, like scp, sftp, or ssh, or if you need to read the log data from a database, or if you need to uncompress, decode, or decrypt a format that is not directly supported by $PRODUCT_NAME, you can do it using a command-line log source.
Command-line log sources are very simple in concept. You give $PRODUCT_NAME a command line; it runs the command line whenever it needs to get the log data; the command, script, or program you specify \"prints\" the log data (i.e. generates it to stdout, the standard command-line output stream), and $PRODUCT_NAME reads the output of the command to get the log data. This provides you with unlimited flexibility in how you feed your data to $PRODUCT_NAME.
For instance, suppose $PRODUCT_NAME didn't support the gzip format (it does). Then you could use the following (UNIX) command-line log source:

/bin/gunzip -c /logs/mylog.gz

Since the -c flag tells gunzip to dump the output to stdout, $PRODUCT_NAME will read the log data directly from this command, without needing to use its built-in gunzipper. More usefully, any decompression utility with a similar flag can be used to allow $PRODUCT_NAME to read any compressed, archived, or encrypted log directly, even if it doesn't know anything about the format.
Even if you don't have a program that will dump the data to stdout, you can still use this approach by writing a tiny script. Consider the following (UNIX) shell script, which scp's a file from a remote server and feeds it to $PRODUCT_NAME:
scp user@host:/logs/mylog.txt /tmp/templog
cat /tmp/templog
rm /tmp/templog
This script copies a log file from a remote machine (securely, using scp), prints it to stdout using \"cat\", and deletes it when it's done. The same script, with slight modifications, could copy multiple files, or use a different method than scp (like sftp) to fetch the files.
A simpler (and better) example which does the same thing is this command:
scp -qC user@host:/logs/mylog.txt > /dev/stdout
This explicitly scp's the file to stdout, which sends it straight into $PRODUCT_NAME without the intermediate step of being stored on disk and then deleted. Since it's just one line, there's no need to use a script at all; this single line can be the command for the log source.
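As a sketch of the multiple-file case (assuming the remote logs match /logs/*.log and using /tmp/templogs as a scratch $lang_stats.directory -- both illustrative), a script like this could serve as the command-line log source:

# copy the logs from the remote machine to a temporary location
mkdir -p /tmp/templogs
scp -q 'user@host:/logs/*.log' /tmp/templogs/
# print them to stdout so $PRODUCT_NAME can read them, then clean up
cat /tmp/templogs/*.log
rm -rf /tmp/templogs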
" } memoryusage = { label = "$PRODUCT_NAME uses too much memory for builds/updates, and is slow to view" question = "When I build or update my database with $PRODUCT_NAME, it uses a huge amount of memory. Then, when I view statistics, it's very slow. What can I do about that?" short_answer = "Decrease the complexity of the database." long_answer = "The main portion of the database that uses memory are the \"item lists\". There is one list for each database field, and each list contains all the unique values for that field. If one of the fields in your database has many unique values, (millions) it can require a very large amount of memory to track. Simplifying the field can save memory.
To check which database field is the main culprit, look at the sizes of the files in the \"items\" sub$lang_stats.directory, in the database $lang_stats.directory (in the Databases $lang_stats.directory of the LogAnalysisInfo $lang_stats.directory). For instance, if the location $lang_stats.directory is the largest, at 500 MB, then you know that the \"location\" database field is responsible for the largest part of the memory usage.
When you've found the culprit, you need to reduce its memory usage. This is where you'll have to make compromises and cuts. The simplest solution is to delete the database field, and stop tracking and reporting on it. If that's not an option, you'll need to simplify the field in some way. The key point here is that you are trying to reduce the number of unique field values that $PRODUCT_NAME sees and tracks. The pool file, which is usually the largest one, contains a back-to-back list of all field values that are used in the database; if you can reduce the number of possible field values used by $PRODUCT_NAME, you will reduce the size of the file.
If the field is hierarchical (like a pathname, hostname, date/time, or URL), you can simplify it by tracking fewer levels, by adjusting the suppress_top and suppress_bottom values in the database.fields section of the profile .cfg file (in the profiles folder of the LogAnalysisInfo folder). For instance, the page field of web logs is tracked nine directory levels deep by default; you can simplify it by tracking only the top three levels. If your date/time field is set to track information to the level of minutes, you can change it to track only hours or days. Usually, you will want to turn off the bottom-level items checkbox for the field, since it's usually the bottom level that has all the detail.
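As a purely hypothetical illustration of where these options live (the field name and the numeric values here are placeholders, not recommendations), the relevant fragment of a profile .cfg might look something like this:

database = {
  fields = {
    page = {
      suppress_top = 0
      suppress_bottom = 4
    }
  }
}

Check the documentation for your version for the exact meaning of each value before changing it.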
Another possibility is to use a Log Filter to simplify the field. The default filter for web logs which replaces everything after ? with \"(parameters)\" is an example of this. By replacing all the various parameterized versions of a URL with a single version, this filter dramatically decreases the number of different page field values that $PRODUCT_NAME sees, therefore dramatically decreasing the memory usage of the \"page\" field. Similarly, if you have a very complex section of your directory structure, but you don't really need to know all the details, you can use a Log Filter to delete the details from your field, collapsing the entire structure into a few items.
A common source of high memory usage is a fully-tracked hostname/IP field. By default, $PRODUCT_NAME tracks only the first two levels of hostnames for web and proxy logs; i.e. it will tell you that a hit came from .sawmill.net, but not that it came from some.machine.sawmill.net. Because of the tremendous number of IP addresses that appear in large log files, this field can be a problem if it's set to track individual IPs (there's a checkmark that lets you do this when you create the profile). If this is happening, consider tracking only a few levels of the hostname hierarchy, instead of the full IP address.
Of course, sometimes you really need the full detail you're tracking in a very large field. If you can't reduce the detail, and you can't reduce the amount of log data, then the only solution is to get enough memory and processing power to efficiently handle the data you're asking $PRODUCT_NAME to track.
" } iiscgitimeout = { label = "IIS CGI Timeout" question = "When I run $PRODUCT_NAME as a CGI program under IIS, I get an error message \"CGI Timeout: The specified CGI application exceeded the allowed time for processing. The server has deleted the process.\" What can I do about that?" short_answer = "Set the IIS CGI timeout to a high value, like 999999." long_answer = "Microsoft Internet Information Server (IIS) automatically terminates CGI programs that run for more than five minutes. Unfortunately, $PRODUCT_NAME can easily use that much when building a database, and if IIS terminates it, it may leave the database partly built and unusable. The solution is to reconfigure the IIS server to increase the CGI timeout to a much larger value. Here's how (instructions are for Windows 2000 Server; other Windows variants may be slightly different):
In the Start Menu, go to the Settings menu, and choose Control Panel.
Open the Administrative Tools control panel.
Open the Internet Services Manager item.
Right-click on the computer icon in the left panel and choose Properties from the menu that appears.
Click \"Edit...\" next to \"WWW Services\".
Click the \"Home Directory\" tab.
Click the \"Profile...\" button.
Click the \"Process Options\" tab.
Enter a large value in the CGI script timeout field, perhaps 999999.
By default, $PRODUCT_NAME binds to all available IPs, so if there's an IP address on your machine where it is allowed to listen on port 8988, it is already listening there (it's also listening on 127.0.0.1).
If you want it to listen only on the IP you specify, you can do it from the Preferences. Go to the Preferences, click on the Network category, change the \"Server hostname\" option to the IP address you want to use, and change the \"Web server port\" option to the port number you want to use. The next time you start $PRODUCT_NAME, it will automatically bind to the IP address you specified.
If you're using the command-line version of $PRODUCT_NAME ($PRODUCT_EXECUTABLE_DOCS), you can either do the same as above, or you can give $PRODUCT_NAME command line options to tell it which IP number and port to use:
$PRODUCT_EXECUTABLE_DOCS -ws t -sh 128.129.130.131 -wsp 8888
When you use these options, $PRODUCT_NAME will immediately start up its web server on the port you specify.
" } windowsservice = { label = "Running $PRODUCT_NAME as a Service" question = "Can I run $PRODUCT_NAME as a Service on Windows? Can I run $PRODUCT_NAME while I'm logged out?" short_answer = "As of version 8, $PRODUCT_NAME is installed as a service when you run the normal installer." long_answer = "Earlier versions of $PRODUCT_NAME required extra steps to run them as a service, but this is no longer a problem-- the normal Windows installer automatically installs $PRODUCT_NAME as a service when you run it." } remoteadmin = { label = "Remote Administration" question = "My web site is hosted in another state. Does $PRODUCT_NAME provide browser based admin tools I can use to configure it and retrieve reports?" short_answer = "Yes, $PRODUCT_NAME's interface is entirely browser based." long_answer = "$PRODUCT_NAME's interface is entirely web browser based. $PRODUCT_NAME runs either as a stand-alone program (in which case it uses its own built-in web server to serve its interface), or as a CGI program (in which case it uses the normal web server on the machine). In either case, $PRODUCT_NAME is configured by running a web browser on any machine you choose, and accessing $PRODUCT_NAME as though it were a web site. Statistics are also served through a web browser interface. You do not need to be physically present at the server to configure it or to view reports; all you need is a web browser.
" } resettrial = { label = "Resetting the Trial Period" question = "My 30-day trial has expired, and I haven't finished evaluating $PRODUCT_NAME yet. How can I get a new trial?" short_answer = "Go to the Licensing page, delete your expired license, and click \"Try $PRODUCT_NAME For 30 Days.\"" long_answer = "$PRODUCT_NAME's trial license allows you to use it for evaluation purposes only. However, if after 30 days you still have not had a chance to fully evaluate $PRODUCT_NAME, you can extend your trial for another 30 days by doing the following:
Go to the Licensing page.
Delete your current trial license.
Click the \"Try $PRODUCT_NAME for 30 Days\" button.
This will work only once -- after that, you will need to contact us at $SUPPORT_EMAIL if you want to extend your trial period further.
" } resetpassword = { label = "Resetting the Administrative Password" question = "I've forgotten the password I chose for $PRODUCT_NAME when I first installed; how can I reset it?" short_answer = "As of version 8.0.2, there is a custom action reset_root_admin." long_answer = "For security reasons, $PRODUCT_NAME requires an administrative username and password whenever you use it (otherwise, anyone could use it to access your computer, since $PRODUCT_NAME is normally accessible by anyone on your network). You choose this username and password when you first run $PRODUCT_NAME, and it asks you for it whenever you run it again.
In version 7, the procedure was simply to delete users.cfg, which caused $PRODUCT_NAME to prompt for a new root admin username and password. However, this is very insecure in a multi-user environment: if the root admin deletes users.cfg but delays entering a new username and password for hours or days, every other user who accesses $PRODUCT_NAME in the meantime will also be prompted to enter a new root admin username and password, and will gain root admin access by doing so.
In version 8, as of 8.0.2, there is now a custom action, reset_root_admin. This is run from the command line like this:
$PRODUCT_EXECUTABLE_DOCS -a rra -u username -pw password
This command changes the root username and password to the values specified for username and password.
E.g., on Windows, from the Command Prompt:
c:\\> cd c:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8
c:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8> $PRODUCT_EXECUTABLE_DOCS_WIN32 -a rra -u jane -pw mypassword
or on Macintosh or Linux/Unix, from the terminal (assuming $PRODUCT_NAME is installed in /Applications/$PRODUCT_NAME):
cd '/Applications/$PRODUCT_NAME'
./$PRODUCT_EXECUTABLE_DOCS -a rra -u jane -pw mypassword
This is even more secure than using a default/default users.cfg, because there is no longer even the possibility of an attacker repeatedly trying default/default in the hope of catching $PRODUCT_NAME between steps 2 and 4 of the original approach (below). The custom action approach also solves the problem of losing other users (and the root admin language), because nothing is changed in users.cfg other than the root admin username and password.
This action exists only in 8.0.2 or later. If you are using an earlier version of 8 (such as 8.0.0) and have forgotten the username or password you originally chose, you can still reset your password, but you must contact $PRODUCT_NAME support; we will give you a replacement users.cfg file to be placed in the LogAnalysisInfo $lang_stats.directory. This will delete all users from $PRODUCT_NAME. Once you have the new users.cfg, access $PRODUCT_NAME again through a web browser, and you will be prompted to choose a new administrative username and password.
" } commandlinebuild = { label = "Building a Database from the Command Line" question = "How do I build a database from the command line?" short_answer = "Run \"executable -p profilename -a bd\" from the command line window of your operating system." long_answer = `It is not necessary to use the web interface to build a database; you can use the command line. This is useful for debugging problems with profiles, or for building when the web interface is not available, e.g. from scripts. The exact method, and the exact command, depends on the platform; see below. See also Additional Notes For All Platforms.
To build a database from the command line, first open a command prompt window. One method to open a command prompt window (sometimes called a DOS window) is to click "start" in the windows task bar then click "run", enter "cmd" in the text box and hit return.
You will get a new window that will display something like this:
Microsoft Windows XP [Version 5.1.2600]
(C) Copyright 1985-2001 Microsoft Corp.

C:\\Documents and Settings\\username>
In the command prompt window you will need to move to the $PRODUCT_NAME installation directory using the "cd" command. $PRODUCT_NAME is installed by default to "C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8"; to move to this directory, type cd C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8 (or whatever path you specified during installation).
C:\\Documents and Settings\\username> cd C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8
C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8>
To get a list of internal profile names type the command "$PRODUCT_EXECUTABLE_DOCS_WIN32 -a lp" at the command prompt. This will display a list of the internal profile names from which you can select the profile you want to build.
C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8>$PRODUCT_EXECUTABLE_DOCS_WIN32 -a lp
$PRODUCT_NAME 8.0.0; Copyright (c) 2008 Flowerfire
myprofile
To build, you will run $PRODUCT_NAME with the "-p profilename -a bd" options. Replace profilename with the internal name of your profile from the list of internal profile names. The build command and related output are shown below. If you want to update your database instead, you can run $PRODUCT_NAME with the "-p profilename -a ud" options.
C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8>$PRODUCT_EXECUTABLE_DOCS_WIN32 -p myprofile -a bd
$PRODUCT_NAME 8.0.0; Copyright (c) 2008 Flowerfire
Reading log file: C:\\Apache [ ] 0.00% 00:00
Reading log file: C:\\Apache [- ] 3.16% 00:01
Reading log file: C:\\Apache [######- ] 33.33% 5000e 00:02
Building cross-reference table 4 (worm) [############# ] 66.67% 00:03
Building cross-reference table 12 (search_engine) [##############= ] 73.68% 00:04
Building cross-reference table 18 (server_response) [####################] 100.00% 00:05
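Similarly, to update the existing database rather than rebuild it (assuming the same profile name), the command is:

C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8>$PRODUCT_EXECUTABLE_DOCS_WIN32 -p myprofile -a ud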
To build a database from the command line, first open a terminal window. On Mac, you do this by selecting the Finder, navigating to the Applications folder, Utilities, and double clicking the Terminal application.
You will get a new window that will display something like this:
Last login: Mon Sep 1 10:46:44 on ttyp1
Welcome to Darwin!
[host:~] user%
In the terminal window you will need to move to the $PRODUCT_NAME installation directory using the "cd" command. Typically $PRODUCT_NAME is located in "/Applications/$PRODUCT_NAME". If you installed $PRODUCT_NAME somewhere else, change the directory name in the command to match. To move to this directory type "cd /Applications/$PRODUCT_NAME":
[host:~] user% cd /Applications/$PRODUCT_NAME
[host:/Applications/$PRODUCT_NAME] user%
To get a list of internal profile names type the command "./$PRODUCT_EXECUTABLE_DOCS -a lp" at the command prompt. This will display a list of the internal profile names from which you can select the profile you want to build.
[host:/Applications/$PRODUCT_NAME] user% ./$PRODUCT_EXECUTABLE_DOCS -a lp
$PRODUCT_NAME 8.0.0; Copyright (c) 2008 Flowerfire
myprofile
To build, you will run $PRODUCT_NAME with the "-p profilename -a bd" options. Replace profilename with the internal name of your profile from the list of internal profile names. The build command and related output are shown below. If you want to update your database instead, you can run $PRODUCT_NAME with the "-p profilename -a ud" options.
[host:/Applications/$PRODUCT_NAME] user% ./$PRODUCT_EXECUTABLE_DOCS -p myprofile -a bd
$PRODUCT_NAME 8.0.0; Copyright (c) 2008 Flowerfire
Reading log file: /logs/Apache [ ] 0.00% 00:00
Reading log file: /logs/Apache [- ] 3.16% 00:01
Reading log file: /logs/Apache [######- ] 33.33% 5000e 00:02
Building cross-reference table 4 (worm) [############# ] 66.67% 00:03
Building cross-reference table 12 (search_engine) [##############= ] 73.68% 00:04
Building cross-reference table 18 (server_response) [####################] 100.00% 00:05
Follow the Mac OS X instructions, which are basically UNIX instructions (since Mac OS X is basically UNIX); change the directories to match the location where you installed $PRODUCT_NAME. The executable file usually ends with the version number on Linux/UNIX platforms, so you'll need to change references from "./$PRODUCT_EXECUTABLE_DOCS" to "./$PRODUCT_EXECUTABLE_DOCS-8.0.0" (or whatever the version is).
When the command completes, the database will be built. If there is an error, it will be displayed in the command line window.
To get debugging output from the build (not usually useful), you can set the SAWMILL_DEBUG environment variable to 1, before rebuilding the database with the command above. On Windows, you can set this variable with "set SAWMILL_DEBUG=1". On Mac or other operating systems, you can run "export SAWMILL_DEBUG=1" (if you're using the bash shell), or "setenv SAWMILL_DEBUG 1" (if you're using csh). If you're not sure which shell you're running, type them both; one will work (it will not give any response), and one will give an error, which you can ignore.
You can also use the -v option to get "verbose" output from the build. There are many -v options available, documented in the "Command-line output types" page of the technical manual ( http://www.sawmill.net/cgi-bin/sawmill8/docs/sawmill.cgi?dp=docs.option&option_name=command_line.verbose ). For very high detail (too slow for any significant build), add "-v egblpfdD" to the command line. If you add much debugging output, you may also want to add "| more" to the end of the command line to pipe the output to a pager, or to add "> out.txt" to the end of the command line to redirect the output to a file.
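For example, on Mac OS X or UNIX, a highly verbose rebuild with its output redirected to a file (assuming a profile named myprofile) would look like this:

./$PRODUCT_EXECUTABLE_DOCS -p myprofile -a bd -v egblpfdD > out.txt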
For more examples of command-line usage, run $PRODUCT_NAME from the command line with the --help option.
` } typicalsetup = { label = "Typical Usage Patterns" question = "How does a typical company use $PRODUCT_NAME; what does a typical $PRODUCT_NAME setup look like?" short_answer = "Installations vary from customer to customer--$PRODUCT_NAME provides enough flexibility to let you choose the model that works best for you." long_answer = "There are quite a lot of different \"models\" that different customers use. For web server analysis, it is common to have $PRODUCT_NAME running on the active web server, either stand-alone or in web server mode, accessing the growing log files directly; this works well as long as the dataset is not too large and the server is not too heavily loaded. For very large datasets, however, many customers have dedicated $PRODUCT_NAME machines, which pull the logs over the network from the server(s). Databases are generally updated regularly; it's common to have them updated in the middle of the night, every night, using the $PRODUCT_NAME Scheduler or an external scheduler like cron.
In terms of the database layout, some common models include:
A single database. Most customers use a single large database that contains all their data. This works well if you have a lot of disk space and a fast computer (or computers) to process your logs with, or if your log data is not too large. You can use $PRODUCT_NAME's normal filtering features to zoom in on particular parts of the data, but it's all stored in a single database. $PRODUCT_NAME has other features that can be used to limit certain users to certain parts of the database; this is particularly useful for ISPs who want to store all their customers' statistics in a single large database, but only let each customer access their own statistics.
A \"recent\" database and a long-term database. In cases where log data is fairly large (say, more than 10 Gigabytes), or where disk space and/or processing power is limited, some customers use two databases, one in detail for the recent data (updated and expired regularly to keep a moving 30-day data set, for instance), and the other less detailed for the long-term data (updated regularly but never expired). The two databases combined are much smaller than a single one would be because they use less overall information, so it takes less time to process the logs and to browse the database. This is often acceptable because fine detail is needed only for recent data.
A collection of specialized databases. Some customers use a collection of databases, one for each section of their statistics. This is particularly useful for log data in the multi-Terabyte range; a tightly-focused database (for instance, showing only hits on the past seven days on a particular directory of the site) is much smaller and faster than a large all-encompassing database. This is also useful if several log files of different types are being analyzed (for instance, an ISP might have one database to track bandwidth usage by its customers, another to track internal network traffic, another to track usage on its FTP site, and another to track hits on its own web site).
There are a lot of options, and there's no single best solution. You can try out different methods, and change them if they're not working for you. $PRODUCT_NAME provides you the flexibility to choose whatever's best for you.
" } competitivecomparison = { label = "$PRODUCT_NAME vs. The Competition" question = "How is $PRODUCT_NAME different from other log analysis tools?" short_answer = "Among other things, $PRODUCT_NAME does not generate static reports -- it generates dynamic, interlined reports." long_answer = "There are many areas in which $PRODUCT_NAME beats the competition, but one major one is that $PRODUCT_NAME's statistics are dynamic, and its statistics pages are interlinked. Most other log analysis programs are report-based -- you specify certain criteria (like, \"give me all hits on my web site on January 14, broken down by page\") and it generates a single report, and it's done. If you want more detail about something, it's not available, or it's only available if you reprocess the log data with different settings.
$PRODUCT_NAME generates HTML reports on the fly, and it supports zooming, filtering, and many other dynamic features. You can zoom in on a certain directory, for instance, and then see the events for that directory broken down by date, or by IP, or by weekday, or in any other way you like. You can create arbitrary filters, for instance to zoom in on the events for a particular address on a particular day, or to see the search terms that were used from a particular search engine on a particular day, which found a particular page. $PRODUCT_NAME lets you navigate naturally and quickly through hierarchies like URLs, pages/directories, day/month/years, machine/subnets, and others.
Of course, there are many other features that set $PRODUCT_NAME apart from the competition-- see our web site for a complete list.
" } multiplesites = { label = "Statistics for Multiple Sites" question = "Can $PRODUCT_NAME generate separate analyses for all the web sites hosted on my server?" short_answer = "Yes, $PRODUCT_NAME includes a number of features for just this purpose." long_answer = "
Absolutely. This is one of our core design goals -- to make $PRODUCT_NAME a good choice for web hosting providers, ISPs, and others who serve multiple sites from a single server. $PRODUCT_NAME's profiles provide an excellent mechanism for generating different statistics for each customer or web site. If each site has its own log file(s), this is trivial; you can just make a profile that analyzes the appropriate log file. If all sites share a single log file, it's not much harder -- $PRODUCT_NAME's advanced filtering mechanism lets you easily ignore all log entries except those of interest to a particular web site.
The technique you use depends on your situation. In general, you will need to have a separate profile for each user (you can quickly create all of your profiles using the Create/Update Many Profiles feature). For maximum flexibility, each profile can have its own database, and each profile can be password-protected or secured in some other way, to prevent unauthorized users from accessing it. See {=docs_chapter_link('security')=} for a discussion of some of the ways profiles can be secured. If each profile has its own database, then the log filters can be used to filter out all statistics except those belonging to the user.
If you don't care if users can access each others' statistics, you can use a single profile with a single database, and give each user a bookmark URL pointing to their statistics in the database; this is the simplest approach, but it makes it possible for one user to see another's statistics, which is usually undesirable.
Advantages of using a single database:
Faster log processing -- log data is read only once. This is particularly important when using an FTP log source with a log file containing the data for all profiles, because the log data will be fetched once per profile, so if you have 1000 profiles, this will use 1000 times more bandwidth. For local log files, this is not much of an issue, because $PRODUCT_NAME skips quickly over log entries it doesn't need, so it will only be spending real time on each log entry once.
Advantages of using multiple databases:
Smaller databases. Though $PRODUCT_NAME has to create many databases instead of one, generally the total disk usage will be smaller, because each database is tightly focused on its site, and does not need to keep around information that applies only to other sites. In one real-world example, the total database size shrank by a factor of 200 when the customer switched from one database to many.
Faster statistics browsing. A small database is generally faster to browse than a large database, so using multiple small databases will make the statistics faster.
More flexibility. Each profile can be configured separately, so you can have different cross-references, filters, database fields, etc. for different profiles. Using a single database locks you into a single database structure for all profiles.
In summary, you'll usually want to use multiple databases for multiple servers or sites. The main situation you'd want to use a single database for is if you're using FTP over a metered line to fetch the data; a single database will fetch it just once. Even then, though, you could set up an external script to fetch the log data to the local disk once, and then process it locally with $(PRODUCT_NAME).
" } filteringdomain = { label = "Filtering All but One Domain" question = "Can $PRODUCT_NAME generate statistics on just one domain, from a log file containing log data from many domains?" short_answer = "Yes. Add a log filter that rejects hits from all other domains." long_answer = "Yes. This can be done easily using a log filter. To do this, click Show Config in the profiles list, click Log Filters, and create a new log filter with this value:
Replace mydomain.com with the actual domain, and replace server_domain with the name of the log field which reports the server domain in your log data. Sometimes, this field is called cs_host. If there is no such field in your log data, then you'll need to use a different log format in order to filter by domain.
The next time you rebuild the database, all log entries from domains other than the one you entered will be rejected, leaving only statistics from the one domain.
" } filteringdirectory = { label = "Excluding a File or {=capitalize(lang_stats.directory)=}" question = "How can I remove a particular file or directory from the statistics?" short_answer = "Use a Log Filter to reject all hits on that file or directory." long_answer = "Create a new Log Filter to reject all hits on that file or directory. To do this, click Show Config in the profiles list, click Log Filters, and create a new log filter with this value:
The filter above rejects hits on the /robots.txt file. Or use this:
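(Again a sketch, with the same caveat about the page field name:)
if (starts_with(page, '/somedir/')) then 'reject';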
The filter above rejects all hits on the /somedir/ directory.
The next time you rebuild the database, all hits on that page or directory will be rejected, so they will not appear in the statistics.
By the way, the same technique can be used to filter hits based on any field, for instance all hits from a particular host or domain, or all hits from a particular referrer, or all hits from a particular authenticated user.
" } rejectspiders = { label = "Discarding hits from spiders" question = "How can I throw away all the spider hits, so I only see statistics on non-spider hits?" short_answer = "Use a Log Filter to reject all hits from spiders (and worms)." long_answer = "Create a new Filter to reject all hits from spiders. The easiest way to create log filters is in the Log Filter Editor, in the Log Filters section of the Config. To get to the Log Filters editor, click Show Config in the Profiles list (or click Config in the reports), then click Log Data down the left, then click Log Filters. To create the filter:
You can also use the Advanced Expression Syntax option from the Filter Type drop-down list (on the Filter tab), and type this filter expression into the value field:
if (spider ne \"(not a spider)\") then \"reject\";
Then rebuild your database, and all hits from spiders will be discarded.
For more details on Filters see {=docs_chapter_link('filters')=}.
" } gzippeddata = { label = "Processing zipped, gzipped, or bzipped Log Data" question = "Can $PRODUCT_NAME process ZIPped, gzipped, or bzipped log data?" short_answer = "Yes, all three." long_answer = "Yes. Any files that end with a .gz, .zip, .bz, or .bz2 will be treated as compressed files by $(PRODUCT_NAME). It will uncompress them \"on the fly\" (not modifying the original file and not creating any new files), and process their uncompressed data the same way it reads normal log files.
" } upgradingpreserving = { label = "Upgrading Without Losing Data" question = "How can I upgrade to a new version of $PRODUCT_NAME without losing my profiles, databases, and other data?" short_answer = "When upgrading 8.5.x to a newer 8.5.x on Windows, just install the new version on top of the old. When upgrading from an older 8.1.x version to a newer 8.5.x version on Windows, just install the new version on top of the old; when you next view the profiles, it will ask you to convert the older-format profiles and databases to the new. When upgrading 8.x to a newer 8.x on non-Windows, install the new and copy profiles, databases, etc. from the old LogAnalysisInfo to the new; if it's from 8.1.x to 8.5.x, it will prompt for conversion. When upgrading 7 to 8.5.x, use the Import link in the Admin menu." long_answer = "$PRODUCT_NAME 8.5.x can be installed directly on top of 8.1.x, or on top of an older 8.5.x. On Windows, just run the installer; it will simply install what's necessary and will not overwrite or remove your existing profiles, databases, or any user configuration data. Installation will not result in data loss. Once the install is complete, you are now ready to continue using $PRODUCT_NAME. If you're upgrading from 8.1.x to 8.5.x, $PRODUCT_NAME will detect your 8.1.x profiles, and will prompt you to convert them when you next view the Profiles list. Databases are converted at the same time as profiles, so you should copy the databases before you run the conversion.
If you're upgrading from an older 8.x to a newer 8.x on a non-Windows installation, start by installing/unpacking the new installation. Don't run it yet, though. In order to preserve profiles, settings, databases, and more, you need to copy them from the old LogAnalysisInfo $lang_stats.directory. Here are the parts you may want to copy:
Profiles. Copy the entire profiles folder in the LogAnalysisInfo $lang_stats.directory, to the new one.
Preferences. Copy preferences.cfg from your LogAnalysisInfo directory, to the new one.
Databases. Copy $lang_stats.directories from your existing LogAnalysisInfo $lang_stats.directory to the new one.
Schedules. Copy the file schedules.cfg from your existing LogAnalysisInfo $lang_stats.directory to the new one.
Users. Copy the file users.cfg from your existing LogAnalysisInfo $lang_stats.directory to the new one.
Licenses. Copy the file licenses.cfg from your existing LogAnalysisInfo $lang_stats.directory to the new one.
Startup Data. Copy the file system.cfg from your existing LogAnalysisInfo $lang_stats.directory to the new one.
Roles. Copy the files roles_enterprise.cfg and roles_standard.cfg from your existing LogAnalysisInfo $lang_stats.directory to the new one.
User Settings. Some per-user settings, including temporary modifications to reports (e.g., showing more rows), and report filters, are in the users_cache $lang_stats.directory in LogAnalysisInfo. Copy these from the old installation to the new, to preserve these settings.
If you edited the graph colors file (LogAnalysisInfo/graph_colors.cfg) or the field categories file (LogAnalysisInfo/field_categories.cfg), copy those from the old LogAnalysisInfo to the new also.
There is a Perl script in Extras, update.pl, which does all these copies in a single step.
After these files are copied, you can start the server in the new installation, and all your data should be preserved. If you're upgrading from 8.1.x to 8.5.x, $PRODUCT_NAME will detect your 8.1.x profiles, and will prompt you to convert them when you next view the Profiles list. Databases are converted at the same time as profiles, so you should copy the databases before you run the conversion.
To upgrade a $PRODUCT_NAME 7 installation to $PRODUCT_NAME 8, install $PRODUCT_NAME 8 in a different location (don't install it over $PRODUCT_NAME 7!), and then in the Admin menu of $PRODUCT_NAME 8, choose Import. Choose the location of the $PRODUCT_NAME 7 LogAnalysisInfo folder (in the installation directory), and you will be prompted to import profiles, databases, and users from the $PRODUCT_NAME 7 installation. After the upgrade is complete and you have verified that all components were upgraded successfully, you can delete the old installation.
WARNING: Regardless of the upgrade version path, back up your existing installation before upgrading. The upgrade process is complex, and if it fails for any reason, it can result in the corruption of the profiles, databases, etc. Be sure you have a backup before upgrading.
" } exiturls = { label = "Tracking Exit URLs" question = "How can I tell where visitors went when they left the site?" short_answer = "Normally, you can't. However, you can set up \"reflector\" pages if you need this information." long_answer = "$PRODUCT_NAME can show you the last page visitors hit before they exited the site, but it cannot usually show you where they went. The reason is that when they click a link on your site leading to another site, their web browser contacts the other site (not your site) for the new page--your web server is not contacted at all when someone clicks a link to leave your site. So the hit appears in the remote site's log files, not yours, and $PRODUCT_NAME cannot report on it because it's not in your log files.
Nevertheless, you can track exits from your site if you're willing to set up \"reflector\" pages. A reflector page is a page whose sole purpose is to reflect a visitor to another page. This can be done with a trivial HTML page containing only a META Refresh tag in the HEAD section. For instance, the following simple HTML page will cause a visitor to be immediately redirected to http://www.flowerfire.com:
<html> <head> <meta http-equiv=\"Refresh\" content=\"0; URL=http://www.flowerfire.com/\"> </head> </html>
By creating a page like this for every exit link on your site, and changing your links to point to the reflector page rather than the actual destination page, you can track exit link usage. When a visitor clicks the exit link, they will be taken to the reflector page, and then immediately reflected to the actual destination. This will happen quickly enough that they will not notice the reflection happening--it will seem to them that they went straight to the destination page. But your log data will include a hit on the reflector page, so you will be able to see which exit links are being taken. In the \"exit pages\" view, the reflector links will show which links were taken when leaving the site.
A more sophisticated way of doing this is to create a CGI script (or other type of script) which generates the reflector page on the fly, given a URL parameter. If you do it that way, you won't need to create a separate reflector page for each link; you can just use the same script for all your external links.
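For example, a minimal reflector CGI might look like the following sketch (illustrative only; the script name and the convention of passing the destination URL in the query string are assumptions to adapt to your own server):
#!/bin/sh
# Hypothetical reflector script: link to it as /cgi-bin/exit.cgi?http://www.flowerfire.com/
# The destination URL arrives in QUERY_STRING; the Location header redirects the browser there,
# while the hit on exit.cgi is recorded in your own log data.
echo \"Location: \$QUERY_STRING\"
echo \"\"
Note that $PRODUCT_NAME's default log filters convert page parameters to \"(parameters)\", so if you pass the destination in the query string, see the \"Page Parameters\" question elsewhere in this FAQ to preserve the parameters in reports.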
" } visitorsums = { label = "Visitor Totals Don't Add Up" question = "When I add up the number of visitors on each day of the month, and I compare it to the total visitors for the month, they're not equal. Why not? Also, why doesn't the sum of visitors on subpages/subdirectories add up to the total for the directory, and why doesn't the sum of visitors on subdomains add up to the total for the domain, etc.? Why are there dashes (-) for the visitor totals?" short_answer = "Because \"visitors\" is the number of unique visitors, a visitor who visits every day will show up as a single visitor in each day's visitors count, but also as a single visitor for the whole month -- not 30 visitors! Therefore, simple summation of visitor numbers gives meaningless results." long_answer = "We get this a lot as a bug report, but $PRODUCT_NAME is not counting visitors wrong. \"Visitors\" in $PRODUCT_NAME's terminology refers to unique visitors (see {=docs_faq_link('datatypes') =}). So:
The total hits in a month is equal to the sum of the hits on the days of the month,
and the total bandwidth for a month is equal to the sum of the bandwidth on the days of the month,
and the total page views for a month is equal to the sum of the page views for each day of the month,
BUT
the total number of visitors in a month is not usually equal to the sum of the visitors on the days of the month.
Here's why. Suppose you have a web site that only one person ever visits, but that person visits it every day. For every day of the month, you will have a single visitor. For the entire month, too, you will have a single visitor, because visitors are unique visitors, and there was only one visitor in the entire month, even though that visitor came back again and again. But in a 30-day month, the sum of the visitors per day will be 30 (one visitor per day, times 30 days). So though $PRODUCT_NAME will correctly report one visitor that month, it will also correctly report one visitor per day.
If what you're really looking for is \"visits\" rather than \"visitors\" (so each visit will count once, even if it's the same visitor coming back over and over), then that's what $PRODUCT_NAME calls \"sessions,\" and you can get information about them in the Sessions Summary and other session-related views (paths through the site, entry pages, exit pages, time spent per page).
In table reports, the total row is calculated by summing all other rows. Because visitors cannot be summed in this way, the visitors column in the total row will always be a dash (-).
" } dnslookup = { label = "Resolving IP Numbers" question = "When I look at the top hosts and top domains, all I see are numbers (IP addresses). How do I get the domain information?" short_answer = "Turn on reverse DNS lookup in the Network options (or in your web server), or use $PRODUCT_NAME's \"look up IP numbers using DNS\" feature." long_answer = "Your web server is tracking the IP numbers of visitors, but not their hostnames or domains. If you need hostname or domain information, you need to tell $PRODUCT_NAME (or your web server) to look up the IP addresses using DNS (domain name service). One way to do this is to turn on DNS lookup in your web server; that will slow down your server, but then $PRODUCT_NAME will report hostnames and domains without any performance penalty during log data processing.
If you're not willing to take the performance hit on your server, or if you want to analyze log data that has already been generated with IP addresses, you can turn on $PRODUCT_NAME's reverse DNS feature like this:
Log in to $PRODUCT_NAME.
Click \"Config Options\" for the profile you want to modify.
Click \"DNS Lookup, Support & Action Email\" in the menu.
Check the box labeled \"Look up IP numbers using domain nameserver (DNS)\".
Enter the hostnames or IP addresses of one or two DNS servers in the DNS server fields. You can get this information from your network administrator, or your ISP.
Click \"Save Changes\".
Rebuild the database (e.g. choose \"Build Database\" from the menu at the top).
Processing log data will be slower with reverse DNS turned on, but you will get full hostname and domain information.
If you have problems getting the DNS feature to resolve IP addresses, see {=docs_faq_link('dnsproblems') =}.
A third option is to use a separate DNS resolving program to resolve the IP addresses in your log files after the server is done writing them, and before $PRODUCT_NAME analyzes them. Examples include logresolve, which is included with the popular Apache web server, and DNSTran, which runs on several platforms including Macintosh, Linux, Solaris, and IRIX. If you're using UNIX or MacOS X, another good option is adns, an asynchronous DNS lookup library that includes some command-line tools for looking up IP addresses, including adnslogres (for Common Access Log Format and Apache Combined format files) and adnsresfilter (for other types of log files). For instance, you can use the command \"adnsresfilter < /path/to/my/log.file\" as your log source command to use adns. adns is faster than logresolve, but more difficult to configure initially.
You can plug any command-line DNS resolver directly into $PRODUCT_NAME by using a command log source, and entering a UNIX command that resolves the IPs in the log file and dumps the resolved log data to the standard output stream; in this case:
logresolve < /path/to/my/log.file
Once you've done that, $PRODUCT_NAME will automatically run logresolve when you process your log data, and it will resolve the data before feeding it to $(PRODUCT_NAME)." } mappeddrivewithservice = { label = "Can't See Network Drives with $PRODUCT_NAME as Service" question = "Why can't $PRODUCT_NAME see my mapped drive, share, directory, or mount points when I run it as a Windows Service?" short_answer = "The Service must run with the same privileged user account that has the mapped drive, share, directory, or mount point privilege." long_answer = "Access to a mapped drive, share, directory, or mount point is a permissions issue that involves security. It is therefore necessary to have the service run using the same privileged account that the drive was originally mapped from, or an account which has permissions to access the share, etc. If the service cannot connect as the same user that has the privilege, the network resource will not be available.
Here is a step-by-step walkthrough on how to change the service logon permission:
Go to Control Panel
Open up Services (location varies slightly with particular OS version)
Find the $PRODUCT_NAME entry (or the entry for the service which is being used to run $PRODUCT_NAME) and right-click it.
Select Properties
Under the 'Log On' tab, deselect the 'Local System Account' radio button by selecting 'This account', and click the Browse button
In the 'Select User' dialog box, you may type in the privileged user's UserID, or browse for it. Once you have selected the correct user, click the OK button; the 'This account' field will be populated by a period, then a backslash (\), then the user's ID
Enter the privileged user's password twice. It will show up as asterisks; this is for security reasons and by design
Back in the Services list, right-click the $PRODUCT_NAME entry and select the 'Restart' option.
When you next run $PRODUCT_NAME, access to the mapped drive, share, directory, or mount point will be available
The Windows 2003 security policies prevent programs like $PRODUCT_NAME from accessing network drives (mapped or UNC). In order to enable access to these drives, you need to do this:
Go to Control Panel
Open Administrative Tools
Click Local Security Policy
Click the Local Policies folder
Click the Security Options folder
Under Network Access, turn on "Let Everyone permissions apply to anonymous users."
Under Network Access, turn off "Restrict anonymous access to named pipes and shares."
Now Windows 2003 will let $PRODUCT_NAME see and access network drives.
` } # mappeddrive2003 w2003_ie_lockdown = { label = "Can't access server with Windows 2003 and IE" question = "On Windows 2003, I can't access the $PRODUCT_NAME server using Internet Explorer. Why not?" short_answer = "The \"Internet Explorer Enhanced Security Configuration\" may be enabled, blocking access; uninstall it or add 127.0.0.1:8988 to the trusted sites." long_answer = "Windows 2003 starts up with Internet Explorer \"locked down\" in a highly secure mode where only certain sites are accessible. In particular, $PRODUCT_NAME's default URL cannot be accessed by Internet Explorer.
To enable access to $PRODUCT_NAME from Internet Explorer, add $PRODUCT_NAME's URL (by default, http://127.0.0.1:8988/) to Internet Explorer's list of trusted sites.
Now you should be able to access $PRODUCT_NAME with Internet Explorer.
Alternatively, use a different browser which does not restrict access.
Alternatively, go to the Add/Remove Programs control panel and uninstall \"Internet Explorer Enhanced Security Configuration\".
" } # w2003_ie_lockdown pageparameters = { label = "Page Parameters" question = "I use parameters on my pages (e.g. index.html?param1+param2), but $PRODUCT_NAME just shows \"index.html?(parameters).\" How can I see my page parameters?" short_answer = "Delete the Log Filter that converts the parameters to \"(parameters).\"" long_answer = "By default, $PRODUCT_NAME creates a log filter to convert everything after the ? in the page field to \"(parameters)\". In most cases that's best, because it reduces the size of the database significantly. But if you need the parameter information, it's easy to get it back--just delete that filter. You can do that like this:
Go to the Config section of your profile.
Click Log Filters.
If your log format is Apache or similar, find the log filter which replaces everything after \"?\" with \"(parameters)\", and delete or disable that log filter.
If your log format is IIS or similar, find the log filter which appends the cs_uri_query field to the cs_uri_stem field, and enable that log filter.
Rebuild the database.
When you view the reports, you'll see that \"(parameters)\" has now been replaced by actual parameters.
" } combinereferrers = { label = "Combining Referring Domains" question = "How can I combine referrers, so hits from http://search.yahoo.com, http://dir.yahoo.com, and http://google.yahoo.com are combined into a single entry?" short_answer = "Create a log filter converting all the hostnames to the same hostname." long_answer = "You can do this by converting all of the hostnames to a single hostname, so for instance they all appear as http://yahoo.com referrers. To do this, you need to convert all occurrences of /search.yahoo.com/, /dir.yahoo.com/, or /www.yahoo.com/ into /yahoo.com/, in the referrer field. The easiest way is to make three log filters, in the Log Filters section of the Config part of your profile:
Then rebuild the database; the resulting statistics will combine all three referrers in a single /yahoo.com/ referrer.
A more sophisticated filter is necessary if you need to preserve some parts of the URL while converting others. In that case, you can use a regular expression filter:
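The following is a sketch of the shape of such a filter (the matching-function name and capture syntax are assumptions; check the expression-language documentation for your version):
if (matches_regular_expression(referrer, \"^http://us\\.f[0-9]*\\.mail\\.yahoo\\.com/ym/(.*)\$\")) then referrer = \"http://us.f*.mail.yahoo.com/\" . \$1;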
The way this works is that it matches any referrer starting with http://us.fN.mail.yahoo.com/ym/ (where N is any integer), and while it's matching, it extracts everything after the /ym/ into the variable \$(1). The leading ^ ensures that the referrer starts with http://, the trailing \$ ensures that the parenthesized .* section contains all of the remainder after /ym/, [0-9]* matches any integer, and \\. matches a single period (see {=docs_chapter_link('regexp')=} for more information about regular expressions). If it matches, it sets the referrer field to http://us.f*.mail.yahoo.com/\$1, where \$1 is the value extracted from the original URL. This allows you to collapse all http://us.fN.mail.yahoo.com/ URLs into a single one without losing the extra data beyond /ym/. If you don't care about the data beyond /ym/, you can use a somewhat simpler (or at least easier-to-understand) filter:
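Something like this (again a sketch; the wildcard-matching function name is an assumption):
if (matches_wildcard_expression(referrer, \"http://us.f*.mail.yahoo.com/*\")) then
referrer = \"http://us.f*.mail.yahoo.com/\";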
This one uses a wildcard comparison (if matches wildcard expression) rather than a regular expression, which allows the use of * in the expression in its more generally used meaning of \"match anything\". Note also that in the first line, * appears twice and each time matches anything, but in the second line it appears only once, and is a literal *, not a \"match-anything\" character.
" } clusteredservers = { label = "Clustered Servers" question = "Can $PRODUCT_NAME combine the logs from multiple clustered or load balanced web servers, so that the user has one view of the data? Can it report separately on the different servers?" short_answer = "Yes." long_answer = "$PRODUCT_NAME can read any number of log files, from any number of servers, into a single database to show a single aggregate set of reports of all the data. If the logs also contain information about which server handled each request (or if each server has a separate log file, or a set of separate log files), then $PRODUCT_NAME can also show per-server statistics, if desired. Unlike many log analysis tools, $PRODUCT_NAME does not care if the files are in order, or if their date ranges overlap -- any combinations of any number of files with data in any order are possible.
To see per-server statistics, look in the reports for a report which breaks down the overall events by server. This might be called \"Server domains\" or \"Server hosts\" or \"Server IPs\" or something else, depending on the log data. Click on a particular server in that report; that zooms you in on that server. Now choose any other report from the \"Default report on zoom\" dropdown menu, to see a breakdown of the statistics for that server only. Alternatively, you can use the global filters to zoom \"permanently\" on a particular server, and then all reports will automatically show numbers for that server only.
If you don't have a field that tracks the server, you may still be able to get per-server statistics, by using the current_log_pathname() function to detect which server each hit came from. You'll need to create a custom field in that case, with a log field to track the server, a filter to compute the field from the log pathname, and a database field and report for the field. For information on creating custom fields, see {=docs_faq_link('custom_fields') =}.
" } logentryorder = { label = "Log Entry Ordering" question = "Does the log data I feed to $PRODUCT_NAME need to be in chronological order?" short_answer = "It depends on the format, but in most cases, the log data can be in any order." long_answer = `$PRODUCT_NAME usually doesn't care what order the log data is in. For most common formats, which have one event per line of log data, $PRODUCT_NAME will just read the log data in any order, and if a reordering is required for some analysis (like web server sessions), it will automatically sort it before doing the analysis. Similarly, when using multiprocessor parsing, $PRODUCT_NAME will split the log data into chunks to distribute to each parsing server, and may parse or import them out of chronological order, without this causing any problems for the reports.
However, there are exceptions. If a log format has dependencies between lines of log data, e.g., if a line of data refers to previous lines of data in any way, then it may be necessary to process the logs in order, to get consistent results. Otherwise, at the boundaries between blocks of log data, it may not be possible to interpret the meaning of the first few lines, which depend on lines from other blocks, which might not have been processed yet, or might be analyzed simultaneously in other threads.
Examples of this type of dependency are Postfix logs, and many other mail server logs, which log "recipient" events on separate lines from "sender" events; Wowza and Flash media server logs, which report incremental bandwidth on each line which must be compared to previous lines to determine actual bandwidth usage by that event; and any log format plug-in which logs events across multiple lines (there are many, but they tend to be less frequently analyzed formats). Examples of log formats not affected are most common formats, including all web servers, all firewall or proxy or gateway servers, and all media servers except Flash and Wowza.
This "boundary problem" is unavoidable to some degree, since every log dataset has at least two boundaries, at the first line of log data and the last one. But it is exacerbated by out-of-order log file processing, and multiprocessor parsing, both of which introduce additional boundaries into the analysis.
A typical analysis will have a small number of boundaries, relative to the number of "good" lines of log data, so this issue can usually be ignored. However, it may result in slight differences in reported numbers from one build to the next, of the same dataset, when using multiprocessor parsing. In rare cases, the differences can be large.
If the boundary problem needs to be eliminated in a profile, it can be mostly resolved by turning off multiprocessor parsing (with {=docs_option_link('log.processing.distributed.method')=}); this will eliminate all boundaries except those between files. If the intra-file boundaries are an issue (which can happen if the profile uses log filters to keep information from previous lines, and apply it to current lines), logs can be manually imported in chronological order, for instance by concatenating them to a single file and importing that file.
Database filters also provide a way of solving this problem in some cases. Since database filters, unlike log filters, operate on the database after it has been imported, and since they can sort the data before they operate, it is usually possible to process data in the required order, regardless of the log data order. The Sessions snap-on uses this technique to analyze the data chronologically, and in order of IP, without requiring the imported log data to be in any special order.
` } create_many_profiles = { label = "Creating many profiles in a batch" question = "How can I create many profiles in a batch, from a template?" short_answer = "Use thecreate_many_profiles
command-line option."
long_answer = `
To create many profiles in a batch, all based on a particular "template" profile, you can use the create_many_profiles command-line feature. To do that, start by editing the file LogAnalysisInfo/miscellaneous/create_many_profiles.cfg, using a text editor (a sketch of the file's overall shape appears after these steps). Do the following:
Change template_profile_name to the internal name of the profile you want to use as a template. The internal name is the name of the file in LogAnalysisInfo/profiles, but without the .cfg extension, so this might be:
template_profile_name = "my_profile"
Change "clone1 = {
" to the internal name of the first profile, the one you want to create:
derived_profile_1 = {
Change the label to the "human readable" name of the profile, e.g.:
label = "Derived Profile 1"
Make changes inside the changes section of the clone1 group, to change any options that you want changed in the profile (whatever should be different from the template profile). One common change is the log source pathname; the following line changes the "pathname" option in the "0" group of the "source" group of the "log" group of the profile .cfg file; i.e., it changes the pathname of log source 0 (which is typically the first log source), so it looks for its log data in /logs/for/clone1:
log.source.0.pathname = "/logs/for/clone1"
Another common change is to add a filter to reject all but a certain class of events, in this profile's database; for instance, this rejects all hits in IIS logs where the page doesn't start with "/abc", resulting in a profile showing only hits on the "abc" directory of the web site:
log.filters.2 = "if (!starts_with(cs_uri_stem, '/abc')) then 'reject';"
Repeat this for as many profiles as you need, by duplicating the clone1 section for each profile, choosing a new internal name (replacing clone1) and label (replacing "Clone 1") for each new profile.
Run $PRODUCT_NAME from the command line, for Windows:
$PRODUCT_EXECUTABLE_DOCS_WIN32 -dp templates.admin.profiles.create_many_profiles
or the following command line, for non-Windows:
$PRODUCT_EXECUTABLE_DOCS -dp templates.admin.profiles.create_many_profiles
This step will create all profiles.
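For reference, the overall shape of the edited create_many_profiles.cfg is roughly as sketched below. The grouping shown here is illustrative, not authoritative; keep whatever structure the shipped file already uses, and edit only the values and sections described in the steps above.
template_profile_name = "my_profile"
profiles = {
  derived_profile_1 = {
    label = "Derived Profile 1"
    changes = {
      log.source.0.pathname = "/logs/for/derived_profile_1"
    } # changes
  } # derived_profile_1
  derived_profile_2 = {
    label = "Derived Profile 2"
    changes = {
      log.source.0.pathname = "/logs/for/derived_profile_2"
    } # changes
  } # derived_profile_2
} # profiles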
At any time, you can recreate all profiles without affecting their databases. So by editing only the template profile, and using create_many_profiles to propagate changes to all clones, you can maintain all profiles with only one template profile.
Security Enhanced Linux (SELinux) restricts what programs can do, to prevent them from misbehaving. The default behavior for an unrecognized program blocks certain operations that $PRODUCT_NAME needs to function, resulting in a blank screen when running $PRODUCT_NAME in CGI mode. This article describes how to lower the restrictions to allow $PRODUCT_NAME to work.
Start by creating a file called sawmill.te, with the following contents:
module sawmill 1.0;
require {
class appletalk_socket create;
class dir getattr;
class dir read;
class dir search;
class dir { getattr read };
class dir { read search };
class file getattr;
class file read;
class netlink_route_socket bind;
class netlink_route_socket create;
class netlink_route_socket getattr;
class netlink_route_socket nlmsg_read;
class netlink_route_socket read;
class netlink_route_socket write;
class socket create;
class socket ioctl;
class udp_socket create;
class udp_socket ioctl;
class unix_dgram_socket create;
role system_r;
type apmd_log_t;
type autofs_t;
type boot_t;
type faillog_t;
type file_t;
type httpd_log_t;
type httpd_sys_script_t;
type lastlog_t;
type mnt_t;
type net_conf_t;
type proc_net_t;
type rpm_log_t;
type samba_log_t;
type sendmail_log_t;
type squid_log_t;
type sysctl_net_t;
type sysfs_t;
type var_log_t;
type var_t;
type wtmp_t;
};
allow httpd_sys_script_t apmd_log_t:file getattr;
allow httpd_sys_script_t autofs_t:dir getattr;
allow httpd_sys_script_t boot_t:dir getattr;
allow httpd_sys_script_t faillog_t:file getattr;
allow httpd_sys_script_t file_t:dir getattr;
allow httpd_sys_script_t httpd_log_t:dir getattr;
allow httpd_sys_script_t httpd_log_t:dir read;
allow httpd_sys_script_t httpd_log_t:file read;
allow httpd_sys_script_t lastlog_t:file getattr;
allow httpd_sys_script_t mnt_t:dir getattr;
allow httpd_sys_script_t net_conf_t:file getattr;
allow httpd_sys_script_t net_conf_t:file read;
allow httpd_sys_script_t proc_net_t:dir { read search };
allow httpd_sys_script_t proc_net_t:file getattr;
allow httpd_sys_script_t proc_net_t:file read;
allow httpd_sys_script_t rpm_log_t:file getattr;
allow httpd_sys_script_t samba_log_t:dir getattr;
allow httpd_sys_script_t self:appletalk_socket create;
allow httpd_sys_script_t self:netlink_route_socket bind;
allow httpd_sys_script_t self:netlink_route_socket create;
allow httpd_sys_script_t self:netlink_route_socket getattr;
allow httpd_sys_script_t self:netlink_route_socket nlmsg_read;
allow httpd_sys_script_t self:netlink_route_socket read;
allow httpd_sys_script_t self:netlink_route_socket write;
allow httpd_sys_script_t self:socket create;
allow httpd_sys_script_t self:socket ioctl;
allow httpd_sys_script_t self:udp_socket create;
allow httpd_sys_script_t self:udp_socket ioctl;
allow httpd_sys_script_t self:unix_dgram_socket create;
allow httpd_sys_script_t sendmail_log_t:dir getattr;
allow httpd_sys_script_t squid_log_t:dir getattr;
allow httpd_sys_script_t sysctl_net_t:dir search;
allow httpd_sys_script_t sysfs_t:dir getattr;
allow httpd_sys_script_t var_log_t:dir read;
allow httpd_sys_script_t var_log_t:file getattr;
allow httpd_sys_script_t var_t:dir read;
allow httpd_sys_script_t wtmp_t:file getattr;
Then run the following commands, as root:
checkmodule -M -m -o sawmill.mod sawmill.te
semodule_package -o sawmill.pp -m sawmill.mod
semodule -i sawmill.pp
These commands package up and install a SE module which allows $PRODUCT_NAME to perform all of its operations. Once you have run these commands, $PRODUCT_NAME should function as a CGI program.
` } # selinux removing_database_fields = { label = "Removing Database Fields" question = "How do I remove fields from the database to save space?" short_answer = "Delete the database.fields entry from the profile .cfg file, and delete any xref groups and reports that use it." long_answer = `Deleting database fields reduces the size of the database, and reduces the time required to build the database. Here's how you can delete a database field:
Using a text editor, edit the .cfg file for your profile, in LogAnalysisInfo/profiles.
Search for "database = {" and then search forward from there for "fields = {" to find the database fields section. Comment out the field you don't want (or delete it). For instance, to remove the screen_dimensions field, change this section:
screen_dimensions = { label = "DOLLARlang_stats.field_labels.screen_dimensions" type = "string" log_field = "screen_dimensions" suppress_top = "0" suppress_bottom = "2" always_include_leaves = "false" } # screen_dimensions
to this:
# screen_dimensions = { # label = "DOLLARlang_stats.field_labels.screen_dimensions" # type = "string" # log_field = "screen_dimensions" # suppress_top = "0" # suppress_bottom = "2" # always_include_leaves = "false" # } # screen_dimensions
The field may also be used by a cross-reference group, which must be removed as well. Search forward for the next "screen_dimensions = {" (in the cross-reference groups section of the database settings), and comment it out or delete it, changing this:
screen_dimensions = { date_time = "" screen_dimensions = "" hits = "" page_views = "" } # screen_dimensions
to this:
# screen_dimensions = { # date_time = "" # screen_dimensions = "" # hits = "" # page_views = "" # } # screen_dimensions
By default, there will also be a report for the field, which has to be removed. Search for "reports = {", then search forward for the appropriate report name, which is the same as the database field name. Comment it out or delete it. For instance, search for "screen_dimensions = {", and then comment it out, replacing this:
screen_dimensions = { report_elements = { screen_dimensions = { label = "%7B=capitalize(pluralize(print(database.fields.screen_dimensions.label)))=}" type = "table" database_field_name = "screen_dimensions" sort_by = "hits" sort_direction = "descending" show_omitted_items_row = "true" omit_parenthesized_items = "true" show_totals_row = "true" starting_row = "1" ending_row = "10" only_bottom_level_items = "false" columns = { 0 = { type = "string" visible = "true" field_name = "screen_dimensions" data_type = "string" header_label = "%7B=capitalize(database.fields.screen_dimensions.label)=}" display_format_type = "string" main_column = "true" } # 0 1 = { header_label = "%7B=capitalize(database.fields.hits.label)=}" type = "number" show_number_column = "true" show_percent_column = "true" show_bar_column = "true" visible = "true" field_name = "hits" data_type = "int" display_format_type = "integer" } # 1 2 = { header_label = "%7B=capitalize(database.fields.page_views.label)=}" type = "number" show_number_column = "true" show_percent_column = "false" show_bar_column = "false" visible = "true" field_name = "page_views" data_type = "int" display_format_type = "integer" } # 2 } # columns } # screen_dimensions } # report_elements label = "%7B=capitalize(pluralize(print(database.fields.screen_dimensions.label)))=}" } # screen_dimensions
to this:
# screen_dimensions = { # report_elements = { # screen_dimensions = { # label = "%7B=capitalize(pluralize(print(database.fields.screen_dimensions.label)))=}" # type = "table" # database_field_name = "screen_dimensions" # sort_by = "hits" # sort_direction = "descending" # show_omitted_items_row = "true" # omit_parenthesized_items = "true" # show_totals_row = "true" # starting_row = "1" # ending_row = "10" # only_bottom_level_items = "false" # columns = { # 0 = { # type = "string" # visible = "true" # field_name = "screen_dimensions" # data_type = "string" # header_label = "%7B=capitalize(database.fields.screen_dimensions.label)=}" # display_format_type = "string" # main_column = "true" # } # 0 # 1 = { # header_label = "%7B=capitalize(database.fields.hits.label)=}" # type = "number" # show_number_column = "true" # show_percent_column = "true" # show_bar_column = "true" # visible = "true" # field_name = "hits" # data_type = "int" # display_format_type = "integer" # } # 1 # 2 = { # header_label = "%7B=capitalize(database.fields.page_views.label)=}" # type = "number" # show_number_column = "true" # show_percent_column = "false" # show_bar_column = "false" # visible = "true" # field_name = "page_views" # data_type = "int" # display_format_type = "integer" # } # 2 # } # columns # } # screen_dimensions # } # report_elements # label = "%7B=capitalize(pluralize(print(database.fields.screen_dimensions.label)))=}" # } # screen_dimensions
Now you need to remove the report element from the single_page_summary report. Search for single_page_summary, then search forward for the field name (e.g., search for "screen_dimensions = {"). Again, comment out the whole report element or delete it, replacing this:
screen_dimensions = { label = "%7B=capitalize(pluralize(print(database.fields.screen_dimensions.label)))=}" type = "table" database_field_name = "screen_dimensions" sort_by = "hits" sort_direction = "descending" show_omitted_items_row = "true" omit_parenthesized_items = "true" show_totals_row = "true" starting_row = "1" ending_row = "10" only_bottom_level_items = "false" columns = { 0 = { type = "string" visible = "true" field_name = "screen_dimensions" data_type = "string" header_label = "%7B=capitalize(database.fields.screen_dimensions.label)=}" display_format_type = "string" main_column = "true" } # 0 1 = { header_label = "%7B=capitalize(database.fields.hits.label)=}" type = "number" show_number_column = "true" show_percent_column = "true" show_bar_column = "true" visible = "true" field_name = "hits" data_type = "int" display_format_type = "integer" } # 1 2 = { header_label = "%7B=capitalize(database.fields.page_views.label)=}" type = "number" show_number_column = "true" show_percent_column = "false" show_bar_column = "false" visible = "true" field_name = "page_views" data_type = "int" display_format_type = "integer" } # 2 } # columns } # screen_dimensions
with this:
# screen_dimensions = { # label = "%7B=capitalize(pluralize(print(database.fields.screen_dimensions.label)))=}" # type = "table" # database_field_name = "screen_dimensions" # sort_by = "hits" # sort_direction = "descending" # show_omitted_items_row = "true" # omit_parenthesized_items = "true" # show_totals_row = "true" # starting_row = "1" # ending_row = "10" # only_bottom_level_items = "false" # columns = { # 0 = { # type = "string" # visible = "true" # field_name = "screen_dimensions" # data_type = "string" # header_label = "%7B=capitalize(database.fields.screen_dimensions.label)=}" # display_format_type = "string" # main_column = "true" # } # 0 # 1 = { # header_label = "%7B=capitalize(database.fields.hits.label)=}" # type = "number" # show_number_column = "true" # show_percent_column = "true" # show_bar_column = "true" # visible = "true" # field_name = "hits" # data_type = "int" # display_format_type = "integer" # } # 1 # 2 = { # header_label = "%7B=capitalize(database.fields.page_views.label)=}" # type = "number" # show_number_column = "true" # show_percent_column = "false" # show_bar_column = "false" # visible = "true" # field_name = "page_views" # data_type = "int" # display_format_type = "integer" # } # 2 # } # columns # } # screen_dimensions # } # report_elements
Finally rebuild the database.
To understand why there are hits shown on both /somedir/ and /somedir, where "somedir" is the name of a directory (folder) in the web site, it is necessary to understand what happens when a browser tries to access http://hostname/somedir . That URL is incorrect (or at best, inefficient), because it lacks the trailing slash, which implies that somedir is a file. Here's what happens in this case:
The web browser asks for a file named /somedir .
The server checks, and finds that there is no file by that name (because it's a directory). It responds with a 302 redirect to /somedir/, which basically means, "no such file, but there is a directory; maybe that's what you meant?"
The browser accepts the redirect, so now it requests a directory named /somedir/
The server notes that there is a directory by that name, and that it contains an index or default file. It responds with a 200 event, and the contents of the index file.
Here is how this looks in the web logs:
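For example, in Apache Common/Combined log format the pair of entries might look something like this (the IP address, timestamps, and sizes are purely illustrative):
12.34.56.78 - - [01/Jan/2010:12:34:56 -0800] "GET /somedir HTTP/1.1" 302 223
12.34.56.78 - - [01/Jan/2010:12:34:57 -0800] "GET /somedir/ HTTP/1.1" 200 5310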
$PRODUCT_NAME reports this as two hits, because it is two hits (two lines of log data). $PRODUCT_NAME differentiates the aggregate traffic within a directory from traffic which directly hits a directory, by using /somedir/ to represent aggregation of traffic in the directory, and using "/somedir/ (default page)" in graphical reports to represent hits on the directory itself (i.e., hits which resulted in the display of the default page, e.g., index.html or default.asp). So in graphical reports, the second hit above appears as a hit on "/somedir/ (default page)".
A good solution to this is to make sure that all links refer to directories with the trailing slash; otherwise the server and browser have to do the elaborate dance above, which slows everything down and doubles the stats.
Another option is to reject all hits where server response starts with 3, using a log filter like this one:
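For example (a sketch using the expression syntax; the response-code field is typically called server_response for Apache-style logs, or sc_status for IIS):
if (starts_with(server_response, '3')) then 'reject';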
This discards the first hit of the two, leaving only the "real" (corrected) one.
In summary, hits on /somedir/ in reports represent the total number of hits on a directory, including hits on the index page of the directory, any other files in that directory, and any other files in any subdirectory of that directory, etc. Hits on /somedir in reports represent the 302 redirects caused by URLs which lack the final /. Hits on "/somedir/ (default page)" represent hits on the default page of the directory.
For information about selecting the default page using a report filter, see {=docs_chapter_link('report_filters')=}.
` } sessions_with_username = { label = "Tracking Sessions with Usernames instead of IPs" question = "$PRODUCT_NAME shows IP addresses, or hostnames, in the Sessions reports, but I want it to show usernames instead. How can I do that?" short_answer = "Edit the profile .cfg, and change sessions_visitor_id_field to the username field." long_answer = `$PRODUCT_NAME calls this the "session user" field, or the "session visitor ID" field. This is the field which differentiates users; if the value in this field is different, for two events, $PRODUCT_NAME assumes that those events are from two different users, and therefore are not part of the same session.
By default, $PRODUCT_NAME uses the "client IP" field (or "hostname", or "source IP", or others, depending on the log format) to differentiate users. But if you have username information in your logs, it is sometimes better to use the username to differentiate sessions, because it better identifies an individual, especially in environments where individuals may use multiple IP addresses.
To do this, edit the profile .cfg file, which is in the LogAnalysisInfo/profiles $lang_stats.directory, using a text editor. Search for this line (its full location is log.field_options.sessions_visitor_id_field):
sessions_visitor_id_field = "hostname"
and change "hostname" to "username" (or "cs_username", or "x_username", or "user", or whatever the field is called in your log data; you can see a list of field names by running $PRODUCT_NAME from the command line with "$PRODUCT_EXECUTABLE_DOCS -p {profilename} -a ldf"). For example change it to this, if your username field is called "username":
sessions_visitor_id_field = "username"
Then, rebuild the database (or delete the LogAnalysisInfo/ReportCache $lang_stats.directory), and view a session report, and $PRODUCT_NAME will recompute your session reports using the user field.
` } hipaa = { label = "Support for HIPAA and Sarbanes-Oxley Compliance" question = "Does $PRODUCT_NAME produce reports for HIPAA and Sarbanes-Oxley (SOX) compliance?" short_answer = "Yes, run the Single-Page Summary report." long_answer = "$PRODUCT_NAME produces reports that will track the network usage, network security and give a comprehensive view of who is accessing your website at any given date or time. The Single-Page Summary report will give the network detection and audit history reporting that is needed to be compliant with both HIPAA and SOX.
" } geolite = { label = "GeoIP database in $PRODUCT_NAME is not as accurate as the one on the Maxmind site" question = "Some of the IP addresses in my data are not resolved properly to country/region/city by $PRODUCT_NAME. I know that $PRODUCT_NAME uses the MaxMind GeoIP database, and when I go to the MaxMind site, their demo resolves these IPs properly. Why isn't $PRODUCT_NAME doing the same as the online GeoIP demo?" short_answer = "$PRODUCT_NAME uses the GeoLite City database, a less accurate (and less expensive) version of the GeoIP City database. To get full accuracy, buy GeoIP City from MaxMind." long_answer = `MaxMind provides two tiers for their City database: GeoIP City and GeoLite City. They do not provide GeoIP City for bundling with products like $PRODUCT_NAME, so $PRODUCT_NAME includes the GeoLite City database. GeoLite City is less accurate than GeoIP City, so the results you get from $PRODUCT_NAME using its default GeoLite City database will be less accurate than using GeoIP City. Since the web demo of GeoIP on the MaxMind site uses GeoIP City, there will be some cases where $PRODUCT_NAME cannot place an IP, but the web demo can.
The solution is to upgrade to the full GeoIP City database, which you can do directly through MaxMind. That database is a drop-in replacement for GeoLite City, so once you have purchased it, you can drop it in on top of the GeoIP-532.dat file in the LogAnalysisInfo $lang_stats.directory in your $PRODUCT_NAME installation, and rebuild your databases, and you will get a more accurate geographical location.
` } format_durations_for_excel = { label = "Formatting Durations for Excel" question = "When I export CSV, durations appear as numbers, which Excel doesn't understand. How can I format durations to work with Excel?" short_answer = "Add an extra column to the spreadsheet to convert them to fractional days; or use a custom database field in the report element." long_answer = `Excel represents durations in days, so "1" is one day, and "1/24" is one hour. But $PRODUCT_NAME represents them as seconds for some log formats, milliseconds for others, and microseconds for a few. To format them as durations in Excel, they must be converted. This can be done either after the export, in Excel, or before the export, in $PRODUCT_NAME.
The easiest way, in most cases, is to add a new column in the exported spreadsheet, to convert between the units. For instance, if column E is the "time taken" field in milliseconds, create a new column with formula "=En/(1000*24*60*60)" where n is the row number, and fill down to populate the whole column. This will create a column whose values are "time taken" in days. Then format the cells of that column to use any "time" format, and it will be formatted as a time, in hour, minutes, seconds, etc.
If formatting after the export is not possible, or not efficient, you can do the conversion in $PRODUCT_NAME, but it's considerably more involved.
For this example, we'll assume we're dealing with the "time-taken" field in IIS web logs, called time_taken in $PRODUCT_NAME.
1. Create a database field with a custom expression.
This custom expression formats the time-taken value in the standard duration_milliseconds format of $PRODUCT_NAME. Do this by editing the profile CFG file (in LogAnalysisInfo/profiles) with a text editor and finding the time_taken database field (search for "database = {"; then search downward from there for "fields = {"; then search downward from there for "time_taken = {"). Duplicate it, adding a time_taken_excel_format database field underneath the time_taken database field:
time_taken_excel_format = { label = "time taken (Excel format)" type = "string" log_field = "time_taken" display_format_type = "duration_milliseconds" expression = \`format(cell_by_name(row_number, 'time_taken'), 'duration_milliseconds')\` } # time_taken_excel_format
2. Add this as a column to the report you'll be exporting. For instance, if the report is the hour_of_day report, find its columns in the CFG file by searching from the top for "statistics = {", then searching down from there for "reports = {", then searching down from there for the report's name (e.g., "hour_of_day = {"), then searching down from there for "columns = {". Copy the time_taken column, and edit the duplicate to look like this:
time_taken_excel_format = { header_label = "time taken (Excel format)" type = "string" show_number_column = "true" show_percent_column = "false" show_bar_column = "false" visible = "true" field_name = "time_taken_excel_format" data_type = "string" display_format_type = "duration_milliseconds" } # time_taken_excel_format
3. Rebuild the database; then when you export this report, it will include a new "time taken (Excel format)" column, with standard $PRODUCT_NAME duration formatting ("Y years, D days, HH:MM:SS.MMM").
` } # format_durations_for_excel mysql_locks_exceeded = { label = "Error with MySQL: \"The total number of locks exceeds the lock table size\"" question = 'When I try to build a database, or view reports, I get an error, "The total number of locks exceeds the lock table size". How can I fix this?' short_answer = "Increase the innodb_buffer_pool_size in my.cnf (my.ini) to 256M."
long_answer = `
This occurs when MySQL runs out of locks, which for an InnoDB database happens when the buffer pool is full. You can fix this by increasing the size of the buffer pool: edit the innodb_buffer_pool_size option in my.cnf (my.ini), in the [mysqld] section, setting it to a number higher than the default (which is typically 8M); for instance:
innodb_buffer_pool_size = 256M
Then, restart MySQL, and try the $PRODUCT_NAME operation again.
` } # mysql_locks_exceeded oraclerror = { label = "Error with Oracle: \"ORA-01000: maximum open cursors exceeded\"" question = 'When building a database with Oracle, I get an error, "ORA-01000: maximum open cursors exceeded." What can I do to fix this?' short_answer = "Increase open_cursors to 1000 in your Oracle server." long_answer = `Though $PRODUCT_NAME does not directly use cursors, some ODBC drivers use several hundred cursors when $PRODUCT_NAME builds a database through them. This can cause an Oracle error if the maximum number of permitted cursors is insufficient.
You can monitor the number of open cursors by running this query against your Oracle database:
SELECT v.value as numopencursors ,s.machine ,s.osuser,s.username FROM V\\$SESSTAT v, V\\$SESSION s WHERE v.statistic# = 3 and v.sid = s.sid;
To fix the problem, increase the maximum number of cursors with this command:
ALTER SYSTEM SET open_cursors = 1000 SCOPE=BOTH;
It is not necessary to restart the database server after running this command--it will affect the running instance immediately.
` } # oraclerror no_result = { label = "The background process terminated unexpectedly" question = "$PRODUCT_NAME displays the following error: \"The background process terminated unexpectedly, without returning a result.\" What does that mean, and how can I fix it?" short_answer = "$PRODUCT_NAME has probably crashed, so this could be a bug in $PRODUCT_NAME. See the long answer for suggestions." long_answer = `This error message means that $PRODUCT_NAME tried to do a long task, like a report generation or a database build, and while it was trying to display progress for the task, it noticed that the task was no longer running, but had not properly computed and stored its result. A task always returns a result, so this means that something has gone wrong internally in $PRODUCT_NAME. The most likely cause is a crash: the background task crashed, so it will never be able to complete and return the result.
A crash is often due to a bug in $PRODUCT_NAME, but it's also possible if $PRODUCT_NAME runs out of memory. Make sure there is enough memory available; if you watch the memory usage while you repeat the task, does it seem to reach a high level, near the maximum memory of the system, before failing? If so, you may need more memory in your system, in order to perform that task.
If it's not memory, try running the task from the command line. If it's a database build, you can run it from the command line using this: {=docs_faq_link('commandlinebuild')=}. If it's a crash during the report generation, you can run it from the command line similarly to a database build, but using "-a grf -rn reportname -ghtd report" instead of "-a bd", where reportname is the internal name of the report. Run $PRODUCT_NAME from the command line with "-p profilename -a lr" to get a list of reports. For instance,
sawmill -p myprofile -a grf -rn single_page_summary -ghtd report
will generate the single-page summary to a $lang_stats.directory called "report". If this report fails, it may give a better error message about what happened to it.
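For reference, the command which lists the internal report names follows the same command-line pattern as the example above (assuming a profile named myprofile):
sawmill -p myprofile -a lr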
Whether it fails or succeeds, email $SUPPORT_EMAIL with the outcome of your test. If possible, include the profile, and enough log data to reproduce the error (up to 10 MB, compressed). Report that you are seeing a crash on report generation (or database build, or whatever), and we will attempt to reproduce it on our own systems, determine the cause, and fix it, or help you resolve it, if it's not a bug.
` } urlsinpix = { label = "Tracking URLs in Cisco PIX log format" question = "How can I track full URLs, or HTTP domains, or resolved hostnames, when analyzing PIX log data?" short_answer = "You can't track full URLs or HTTP domains, because PIX doesn't log them; but you can turn on DNS lookup in the PIX or in $PRODUCT_NAME to report resolved hostnames." long_answer = "The Cisco PIX log format can be configured to log hostnames as well as IPs; if it does, the PIX plug-in will report the hostnames. This is the preferred way to get hostname information from PIX. If that's not an option, $PRODUCT_NAME can be configured to look up IP addresses using the DNS Lookup section of the Config page. In this case, the IP address field value will be replaced by the resolved hostname, so this resolved hostname will appear in the IPs reports. PIX does not log URLs, however, so it is not possible for $PRODUCT_NAME to report domains accessed. PIX reports lines like this:
Accessed URL 12.34.56.78:/some/file/test.html
This shows the source IP, which we have from another line, and the URL stem, which is slightly useful, but it does not show the domain; and resolving the IP just gives the resolved hostname, not the domain from the URL. Still, it's better than nothing; resolving the hostname might give something like server156.microsoft.com, which at least tells you it's microsoft.com traffic, even if you can't tell whether it was msdn.microsoft.com or www.microsoft.com.
PIX can also be configured to log hostnames in the Accessed URL lines, which looks something like this:
Accessed URL 12.34.56.78 (server156.microsoft.com):/some/file/test.html
But this has the same problem; it shows the hostname, not the HTTP domain. It seems that the HTTP domain is not available from PIX log data.
The reasons we recommend doing DNS lookup in PIX, rather than in $PRODUCT_NAME, are twofold:
1. DNS lookup after-the-fact may give a different hostname than it would have given at the time, and the one at the time is more accurate.
2. DNS lookup in $PRODUCT_NAME replaces the IP address with the hostname, so the IP is not available in the reports. DNS lookup in PIX *adds* the hostname as a separate field, so both are available in the reports.
" } mysqlmacx64 = { label = "MySQL and x64 MacOS" question = `I installed $PRODUCT_NAME on a 64-bit (x64) Mac, and now it says, "This profile uses a MySQL database, but MySQL is not enabled in this build." Why?` short_answer = "MySQL does not currently work on x64 MacOS." long_answer = "Because there is not a current version of MySQL available for x64 MacOS, it is not possible to build or use MySQL databases on x64 MacOS with $PRODUCT_NAME. When a x64 MacOS version of MySQL becomes available (from the makers of MySQL), we will add support in $PRODUCT_NAME. For now, use the x86 version of $PRODUCT_NAME, which will run on x64 MacOS, and can use MySQL." } # mysqlmacx64 restore = { label = "Backup and Restore" question = "How do I backup and restore my $PRODUCT_NAME installation, or a particular profile and its database?" short_answer = "Backup and restore the LogAnalysisInfo folder when no update or build is running, or for one profile. For MySQL also backup and restore the MySQL database." long_answer = "If you're using the internal database, you can back up the LogAnalysisInfo $lang_stats.directory in your $PRODUCT_NAME installation $lang_stats.directory, to back up the entire installation; and you can restore it to restore the entire installation. This will back up profiles, databases, users, preferences, scheduled tasks, and more. The backup and restore must occur when there is no database update or rebuild in progress; it is fine if there is a report generation in progress.
If you're using a MySQL database, you can do the backup/restore as described above, and you will also need to back up the MySQL database for each profile. By default, the MySQL database's name is the same as the internal name of the profile, but it can be overridden in Database Options, in the Config section of the profile. Consult the MySQL documentation for information on backing up and restoring a database.
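As one illustration only (a sketch, assuming the MySQL database has the default name \"myprofile\" and that you have suitable MySQL credentials; adjust database names, users, and file paths to your installation), a backup and a later restore with the standard MySQL tools might look like this:
mysqldump -u root -p myprofile > myprofile_backup.sql
mysql -u root -p myprofile < myprofile_backup.sql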
To backup or restore a particular profile, backup or restore the profile file from LogAnalysisInfo/profiles, and the database folder from LogAnalysisInfo/Databases, and if you're using MySQL, backup and restore the MySQL database for the profile.
" } # restore permission_denied = { label = "Permission Denied Errors" question = `On Windows, I sometimes get "permission denied" errors, or "volume externally altered" errors, or "file does not exist" error when building a database. But sometimes, it works. What can cause this sort of sporadic file error?` short_answer = "An anti-virus or anti-malware software, which is actively scanning your $PRODUCT_NAME installation folder, can cause this. Disable scanning of $PRODUCT_NAME's data folders, in the anti-virus product." long_answer = `Some anti-virus software, and anti-malware software, actively scans the entire disk, looking for viruses or other malware. This sort of scanning can interfere with $PRODUCT_NAME's operation, if the software scans $PRODUCT_NAME's data files, or database. The anti-malware software interferes in two ways: (1) it opens $PRODUCT_NAME's data files, and holds them open while $PRODUCT_NAME is trying to write to them, during database builds, which causes Windows to refuse $PRODUCT_NAME write access to its own internal database files, and (2) if the malware detects a virus signature in one of $PRODUCT_NAME's database files, it may delete or modify that file, corrupting the database. The second scenario can occur even if there is no actual virus present, because $PRODUCT_NAME's database files are binary files, which can potentially contain any possible virus signature due to random permutations of the data; and worse, because $PRODUCT_NAME is often used to scan web logs, mail logs, and even antivirus logs which naturally contain virus signatures of the viruses which were encountered by the logging devices or servers.
Even when anti-virus scanning does not cause errors in $PRODUCT_NAME, it can greatly reduce the performance of $PRODUCT_NAME, as both fight for access to the same files. The performance impact can be 20 times or greater--a database which might normally take 1 hour to build might take 20 hours or more.
The solution is to disable scanning of $PRODUCT_NAME's directories. Anti-malware should not be completely turned off--it is important to the security of your system, but most products can be selectively disabled, so they will not scan particular folders. In a default installation, $PRODUCT_NAME is found in the Program Files folder of the C: drive, so disabling scanning of the $PRODUCT_NAME folder there will greatly improve the performance and reliability of $PRODUCT_NAME.
` } # permission_denied dynamiccasterror = { label = "Relocation error: __dynamic_cast_2" question = 'When I try to run $PRODUCT_NAME, I get an error "relocation error: $PRODUCT_EXECUTABLE_DOCS: undefined symbol: __dynamic_cast_2". How can I fix this?' short_answer = "This is a GNU library incompatibility; build $PRODUCT_NAME from source instead of using the binary distribution." long_answer = `This occurs on UNIX systems, and is due to $PRODUCT_NAME being built expecting a different version of the GNU libraries than the one you have on your system (libstdc++). In other words, this is an operating system incompatibility -- we're building on a different version than you're running on.
The best solution is to use the "encrypted source" version of $PRODUCT_NAME, rather than the binary distribution for your platform; i.e., choose "encrypted source" as the "operating system" when you're downloading $PRODUCT_NAME. This version requires that you have a C/C++ compiler installed on your system. Follow the instructions to build $PRODUCT_NAME from source -- it's easy. The resulting binary will run properly on your system.
If you don't have a compiler installed, please contact $SUPPORT_EMAIL.
` } # dynamiccasterror ftplogsource = { label = "Downloading Log Data by FTP" question = "Can $PRODUCT_NAME be configured to automatically FTP log files from multiple servers, and add them daily to a database?" short_answer = "Yes." long_answer = "Yes; just select one of the FTP log sources when $PRODUCT_NAME asks you where your data is. $PRODUCT_NAME can FTP one or more log files from any FTP server, anonymously or with a username/password." } clientsecurity = { label = "Protecting Clients' Statistics" question = "Can $PRODUCT_NAME be configured to limit access to statistics, so that a customer can only see the statistics associated with their section of my web site?" short_answer = "Yes, you can password protect statistics in several ways." long_answer = "Yes. $PRODUCT_NAME provides several ways to do this. In general, you will create a separate user for each client, and a separate profile for each client. Then you will configure their user to be non-administrative, and to have permission to access only their own profile. Finally, you will set up their profile to show only their data, either by pointing it only at their files, or (if their data is interleaved with other clients' data), by using log filters to discard all events from the log which don't belong to them.
" } scheduling = { label = Scheduling question = "Can $PRODUCT_NAME be configured to automatically analyze the access log for my site on a shared server once a day at a given time?" short_answer = "Yes, if you run it stand-alone, or if your server has a scheduling program." long_answer = "It depends on your web server. If you run $PRODUCT_NAME as a stand-alone program (rather than as a CGI program) on your server, then you can use $PRODUCT_NAME's built-in Scheduler to do this. If you can't run it stand-alone or don't want to, then you can still set up automatic database builds if your server has its own scheduling program (like cron or Windows Scheduler).
" } excludeip = { label = "Excluding an IP Address or Domain" question = "How can I exclude hits from my own IP address, or from my organization's domain?" short_answer = "Add a Log Filter to exclude those hits." long_answer = "One way to do this is to use a global filter in the statistics, and use \"!(hostname within '123.124.125.126')\", and this is often the first thing people try, but it's not the best choice. The speed of a statistics filter depends on the number of items checked, so if there are 100,000 IP addresses in your log file, and you check all 100,000, then $PRODUCT_NAME will take up to 100,000 times longer to generate each page. That is probably not what you had in mind. A much better option is to use the Log Filters.
Log filters are used to filter out or modify log data as it is being read (rather than filtering database data as it is being browsed, like the statistics filters). You can get to the Log Filters by clicking Show Config in the profiles list, and clicking the Log Filters category.
You want to create a filter that will reject any log entries whose hostname field is your IP address. If your IP address is 128.128.128.128, the filter you want is this:
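(The exact log filter syntax can vary between versions of $PRODUCT_NAME, so treat this as a sketch of the idea rather than text to paste verbatim; it rejects any entry whose hostname field equals your IP address:)
if (hostname eq \"128.128.128.128\") then \"reject\";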
The name of the field (\"hostname\" here) depends on your log data -- use the name that your log data uses. For instance, IIS W3C format calls the field c_ip, so for IIS you would use this:
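(Again just a sketch, with the same caveat about exact syntax; only the field name changes:)
if (c_ip eq \"128.128.128.128\") then \"reject\";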
You can get a list of the fields in your profile by running $PRODUCT_NAME from the command line with \"-p profilename -a llf\".
The next time you rebuild the database, hits from your IP address will be rejected, and will not appear in the statistics.
Rejecting all hits from a particular domain is very similar; if your domain is mydomain.com, and your server is set to look up IP addresses, then you can use this filter:
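(A sketch of the idea, borrowing the \"within\" comparison shown in the statistics-filter example above; check the Log Filters documentation for the exact log filter form in your version:)
if (hostname within 'mydomain.com') then \"reject\";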
If your server logs hostnames as IP addresses (and does not resolve them to hostnames with DNS), you can use the subnet for your domain instead; for instance, if all hits from mydomain.com will come from the subnet 128.128.128, then you can use this filter:
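(Sketch, same caveats; the idea is to match on the leading octets of the IP address:)
if (hostname within '128.128.128') then \"reject\";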
Yes, very well. Most statistics packages will only show you the \"top paths\" or maybe the entry and exit pages; $PRODUCT_NAME shows you all the paths visitors took through the sites, in an easily navigated hierarchical report. You get complete data about every path that every visitor took through your site, click-by-click. For even more detail, you can zoom in on a particular session in the \"individual sessions\" report, to see the full log data of each click in the session.
" } resources = { label = "Resource Usage" question = "How much memory/disk space/time does $PRODUCT_NAME use?" short_answer = "It depends on how much detail you ask for in the database. It uses very little if you use the default detail levels." long_answer = "Memory usage depends mostly on the complexity of your data set (not the size). If your database has fields with millions of unique values, it will use many megabytes for each of those fields. It's uncommon for any particular field to require more than 100M, but in extreme cases, fields can use over 1G.
Disk usage is roughly 200% to 300% of the size of your uncompressed log data. In some cases, you may need 400% of your uncompressed data size. So if you're processing 500 GB of log data, you'll need about 1500 GB of disk space to hold the database.
The time to process a dataset is roughly proportional to the size of the dataset. As of 2004, on a moderately fast single-CPU system, $PRODUCT_NAME typically processes between 5,000 and 10,000 lines of log data per second.
" } largelogs = { label = "Processing Large Log Files" question = "How large of a log file can $PRODUCT_NAME process?" short_answer = "There are no limits, except those imposed by the limitations of your server." long_answer = "There is no fundamental limit -- given enough memory, disk space, and time, you can process the world. We've processed log files terabytes in size, billions of lines long, and been able to browse their statistics at full complexity in real time, with no troubles.
" } logformats = { label = "Supported Log Formats" question = "What sorts of log files can $PRODUCT_NAME process?" short_answer = "$PRODUCT_NAME can handle all major log formats and many minor formats, and you can create your own custom formats." long_answer = "$PRODUCT_NAME is not just for web server logs, though it's well suited to that task. $PRODUCT_NAME also supports firewall logs, proxy logs, mail logs, antivirus logs, network logs, FTP logs, and much more.
Click here for the full list of {=docs_chapter_link('logformats')=}.
It automatically detects all the formats it supports, and chooses appropriate settings for the format.
We're continually adding new log formats, so the list above will keep growing. However, due to the large number of format requests, we cannot add all the formats that are requested. If your log format is not recognized by $PRODUCT_NAME, and you need support for a format, we can add it to $PRODUCT_NAME for a fee; contact $SUPPORT_EMAIL for details.
If you want to analyze a log in a different format, $PRODUCT_NAME also lets you create your own format description file; once you've done that, your format becomes one of the supported ones--$PRODUCT_NAME will autodetect it and choose good options for it, just like any built-in format.
$PRODUCT_NAME's format description files are very flexible; almost any possible format can be described. If you have an unsupported format and you'd like help writing a format file, please contact $SUPPORT_EMAIL, and we'll write a format file for you, at no charge.
" } peakperiods = { label = "Peak Period Reports" question = "Does $PRODUCT_NAME do \"peak period\" reports (by weekday, or hour)?" short_answer = "Yes." long_answer = "Yes. $PRODUCT_NAME lets you break your statistics down by any of a large number of criteria, and by more than one at a time. Among these criteria are \"day of week\" and \"hour of day,\" so you can see weekday or hour information just by adding the appropriate field to your database.
" } weeklystatistics = { label = "Weekly Statistics" question = "Can I see the number of hits per week? Can I see a \"top weeks\" report?" short_answer = "Yes, by using the Calendar, and/or creating a database field and a report tracking \"weeks of the year.\"" long_answer = "The date/time field in $PRODUCT_NAME tracks years, months, days, hours, minutes, and seconds. Each of these units fits evenly into the larger unit (24 hours in a day, 12 months in a year, etc.). Because weeks do not fit evenly into months, $PRODUCT_NAME cannot easily fit weeks into the date/time hierarchy. Still, there are several ways to see weekly statistics.
One way is to use the Calendar. In the Calendar, each week is represented as a link called \"week\"-- clicking the link applies a filter to the date/time field that shows the hits on those seven days. This lets you zoom in on a particular week, so you can see the statistics for that week, or you can switch to other views to learn more about the activity for that week. However, if you do it that way you can't see a list or graph of weeks, with the hits for each week, the way you can for days in the \"Days\" report.
If you need a weekly graph or table, you need to track the \"week of the year\" log field in your database. The week of the year is a number between 1 and 52 that represents the week of the year (e.g. 1 means January 1 through January 7, etc.). You can track the week of the year field like this:
Open the profile file ($PRODUCT_NAME/LogAnalysisInfo/profiles/profilename.cfg) you want to add week_of_year reports to in your favorite text editor (e.g. Notepad).
Search for \"database = {\", then search for \"fields = {\" and scroll down until you see \"day_of_week = {\"
Copy this line and all lines until the line \"} # day_of_week\" and paste it all just underneath.
Where you see day_of_week in the new section change it to week_of_year (except use \"string\" where you see \"display_format_type\"), so it becomes:
day_of_week = { label = \"$lang_stats.field_labels.day_of_week\" type = \"string\" log_field = \"day_of_week\" display_format_type = \"day_of_week\" suppress_top = \"0\" suppress_bottom = \"2\" always_include_leaves = \"false\" } # day_of_week week_of_year = { label = \"$lang_stats.field_labels.week_of_year\" type = \"string\" log_field = \"week_of_year\" display_format_type = \"string\" suppress_top = \"0\" suppress_bottom = \"2\" always_include_leaves = \"false\" } # week_of_year
Then search for \"reports = {\" and duplicate (by copy/paste as above) an existing report (the Day of week report is a good choice), and again where you see day_of_week in the new section change it to week_of_year (except use \"string\" where you see \"display_format_type\").
Then search for \"reports_menu = {\" and then \"date_time_group = {\" and duplicate (by copy/paste as above) an existing report menu (the Day of week report is a good choice), and again where you see day_of_week in the new section change it to week_of_year (except use \"string\" where you see \"display_format_type\").
Save the changes you have made.
Rebuild the database.
The new report will show you traffic for each week of the year.
" } timeofday = { label = "Time of Day Statistics" question = "Does $PRODUCT_NAME do time of day?" short_answer = "Yes." long_answer = "Yes, $PRODUCT_NAME can pinpoint your hits to the second. By default, it also breaks down hits by hour, so you can detect peak usage and other hourly information. The Log Detail report show complete information about each event, down to the second, so you can zoom in on any part of your statistics, and then zoom down to the level of the log data to see event-by-event second-by-second what occurred.
" } uniquevisitors = { label = "Unique Visitors" question = "Can $PRODUCT_NAME count unique visitors?" short_answer = "Yes, using unique hostname or using cookies." long_answer = "Yes; $PRODUCT_NAME can tell you the number of unique visitors for any item in the database, including the number of visitors for a particular day, the number of visitors from a particular domain, the number of visitors who hit any particular page or directory, or any other type of data $PRODUCT_NAME can display.
By default, $PRODUCT_NAME uses the hostname field of your log data to compute visitors based on unique hosts. That works for all log files, but it's a somewhat inaccurate count due to the effect of proxies and caches. If your log data tracks visitors using cookies, you can easily configure $PRODUCT_NAME to use the cookie information instead, by changing the \"visitors\" database field so it is based on the cookie log field instead (in the Log Filters section of the profile Config). See also {=docs_faq_link('visitorcookies') =}.
" } visitorcookies = { label = "Counting Visitors With Cookies" question = "Can $PRODUCT_NAME count visitors using cookies, rather than unique hostnames?" short_answer = "Yes -- it includes a built-in log format to do this for Apache, and other servers can be set up manually." long_answer = "Yes. The reason you'd want to do this is that using unique browsing hostnames (or IPs) to count visitors is an imprecise method, since the same actual visitor may appear to come from several hostnames -- the same person may dial up and receive random IP addresses, or in some extreme cases, their ISP may be set up so that they have a different IP address for each hit, or several actual visitors may appear as one hostname if they're all using the same proxy. The solution to this problem is to set your web server to use cookies to keep track of visitors. Apache and IIS can be configured to do this, and in both cases, $PRODUCT_NAME can be configured to use the cookie log field, instead of the hostname, as the basis for its \"visitor\" field. To do this, edit your profile (in LogAnalysisInfo/profiles) with a text editor, find the \"visitors\" database field (look for \"database = {\", then \"fields = {\", then \"visitors = {\"), and change the log_field value to your cookie field; for instance, if your cookie field is cs_cookie, change it to log_field = \"cs_cookie\". Note that this will only work if your entire cookie field tracks the visitor cookie, and does not track any other cookes; if you have multiple cookies, you can't use the whole cookie field as your visitor ID, and you need to use the approach described below to create a visitor_id field and use a regular expression to extract your visitor cookie into it, and then change log_field to visitor_id.
If your server or environment already tracks visitors by cookie, you can skip this section. If not, you need to add a bit of JavaScript to each of your web pages, to assign cookies to each visitor. To do this, copy the log_analysis_info.js file, from the Extras folder of your $PRODUCT_NAME installation, into a folder called js, in your web server root directory, and add this to every possible entry page (best to add it to every page):
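(The exact include tag depends on your pages; as a minimal sketch, assuming the file was copied into a js folder at the web server root as described above, it would look something like this:)
<script type=\"text/javascript\" src=\"/js/log_analysis_info.js\"></script>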
In the case of Apache, it's even easier, because $PRODUCT_NAME includes a log format descriptor for a special \"combined format plus visitor cookie\" log format. The format is just normal combined format, with the visitor ID stuck at the front of each log entry. You can log in this format by adding the following lines to your httpd.conf file:
CookieTracking on CookieExpires \"2 weeks\" CustomLog /var/log/httpd/cookie.log \"%{cookie}n %h %l %u %t \\\"%r\\\" %>s %b \\\"%{Referer}i\\\" \\\"%{User-Agent}i\\\"\"
(replace /var/log/httpd/cookie.log above with the pathname of the log you want to create). When you point $PRODUCT_NAME at this log file, it will recognize it as an \"Apache Combined With Visitor Cookies\" log, and it will set up the log filter described above for you, so you don't have to do any manual profile configuration at all.
IIS has built-in support for visitor cookies -- just turn on logging of the Cookie field (extended property), or tell IIS to use \"W3C Extended Log File Format\" for logging, and you'll get cookies in your log data. Once you've done that, you'll need to create a \"visitor_id\" log field to hold the cookie information, and use that field as the basis for your visitor database field.
If your cookie field contains more than just a visitor ID, you'll need to extract the visitor ID part of the field, and put it into a separate \"visitor id\" log field in $PRODUCT_NAME. This can be done using a regular expression filter with variable replacement. First, you'll need to create a visitor ID log field. You can do this by editing the profile .cfg file (in the profiles $lang_stats.directory of the LogAnalysisInfo $lang_stats.directory in your installation), and finding the log.fields group (search for \"log =\" and then forward from there for \"fields =\"). Add the following log field:
visitor_id = { label = \"visitor ID\" type = \"flat\" }
Next, in the same .cfg file, change database.fields.visitors.log_field to visitor_id (i.e. search for \"database =\", then search for \"fields =\", then search for \"visitors =\", and then set the log_field value within visitors to visitor_id), so the visitors field uses the visitor_id to determine whether two events are from the same visitor.
Then, add a log filter (in the Log Filters section of the profile Config, or in the log.filters section of the .cfg file) to extract the visitor ID from the cookie. For example, suppose that the cookie field value looks like this:
var1=value1&var2=value2&lavc=123456789&var3=value3
The lavc cookie (the visitor id, 123456789 in this case) is buried inside the field, surrounded by other cookie names and values. To extract it you need a filter that grabs the part after lavc= and before &. This can be done most easily with the following filter:
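(A sketch of the idea only -- the exact log filter syntax, the name of your cookie log field (it may be cookie, cs_cookie, or similar), and the captured-group variable are assumptions here, so adjust them to match your Log Filters documentation; the regular expression in quotes is the important part:)
if (matches_regular_expression(cookie, \"lavc=([^&]*)\")) then visitor_id = \\$1;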
(For IIS, the value in quotes will be ASPSESSIONID[A-Z]*=([^&]*).) This filter finds a section of the field starting with lavc=, followed by a series of non-& characters (up to the next & or the end of the field), and it sets the visitor id to the sequence of non-& characters it found (123456789, in this case).
Once you've added the visitor id log field, and the filter to set it, and modified the visitors database field to use the visitor id as its log field, rebuild the database. $PRODUCT_NAME is now using the lavc value from your cookie field as your visitor id, which should make your visitors counts more accurate.
" } robotstxt = { label = robots.txt question = "Why do I see hits on a file called \"robots.txt\" in my statistics?" short_answer = "robots.txt is a file that tells search engine spiders and robots what they can do, so a hit on robots.txt means that a spider visited your site." long_answer = "robots.txt is a \"standard\" file that appears at the root level of many web sites to tell search engine robots what to do on the site. Robots, also known as spiders, are computer programs that attempt to systematically visit and catalog all the pages on the Web. robots.txt tells the robots what they can or can't do on the site (whether they can index the site, which pages they may not index, etc.). Any correctly written robot will hit that page first, and follow the instructions it finds there. So the hits you're seeing are from robots.
If you don't have a robots.txt file on your site, the robots don't actually get any information--they get a \"404 File Not Found\" error instead, which they generally interpret as \"index whatever you want.\"
" } # graph_options = { # label = "Changing graph options" # question = "How can I change the graphs to pie charts, or add a legend, or change which field is graphed?" # short_answer = "Edit the report element in the profile .cfg file -- see long answer for full instructions." # long_answer = "To changing graphing options, #you need to edit the profile .cfg file. The file is in the profiles folder of the LogAnalysisInfo folder #of your $PRODUCT_NAME installation.
#First, open that file with a text editor. Then search for reports = {
to find the
#beginning of the section which describes reports. There will be one group within that section for each report
#in the profile. Find the group corresponding to the report you want to edit. For instance, the file_type report
#would start like this:
# file_type = { # report_elements = { # file_type = { # label = \"%7B=capitalize(pluralize(print(database.fields.file_type.label)))=}\" # type = \"table\" # database_field_name = \"file_type\" # sort_by = \"hits\" # sort_direction = \"descending\" # show_omitted_items_row = \"true\" # omit_parenthesized_items = \"true\" # show_totals_row = \"true\" # starting_row = \"1\" # ending_row = \"10\" # only_bottom_level_items = \"false\" # show_graph = \"false\" # columns = { # 0 = { # type = \"string\" # ... ##In most cases, there is a single report element for each report, but if you're editing a report like #the single page summary, there may be multiple groups in the report_elements group; find the group corresponding #to the table you're editing. #
If graphing is off, as in this example, you can first turn it on by changing the show_graph line to
#show_graph = true
and adding a graph section:
# file_type = { # report_elements = { # file_type = { # label = \"%7B=capitalize(pluralize(print(database.fields.file_type.label)))=}\" # type = \"table\" # database_field_name = \"file_type\" # sort_by = \"hits\" # sort_direction = \"descending\" # show_omitted_items_row = \"true\" # omit_parenthesized_items = \"true\" # show_totals_row = \"true\" # starting_row = \"1\" # ending_row = \"10\" # only_bottom_level_items = \"false\" # show_graph = true # graph = { # numerical_fields = { # hits = true # } # } # columns = { # 0 = { # type = \"string\" # ... ##
This will add a graph to the report; by default it will be a bar chart with colored bars and a legend. #If you prefer a pie chart, use these graphing options instead:
## show_graph = true # graph = { # pie_chart = true # numerical_fields = { # hits = true # } # } ##
If you'd like to graph a field other than \"hits\", change \"hits\" to the internal name of a numerical
#database fields; e.g. \"page_views\" or \"events\" or \"messages\" or whatever the appropriate fields
#are for your log format. You can get a list of available database field names by running $PRODUCT_NAME
#from the command line with the options: -p profilename -a ldf
.
You may also need to delete the ReportCache $lang_stats.directory in the LogAnalysisInfo $lang_stats.directory #for changes to take effect immediately; if you don't, reloading the report may load a cached version of the report #from before the changes.
. #" # } # report_filters = { # label = "Adding report filters" # question = "How can I add a filter which permanently applies to just one report, or report element?" # short_answer = "Add it in the profile .cfg file -- see long answer for full instructions." # long_answer = "To add a filter to a single report, or a report element (table and graph) within a report, #you need to edit the profile .cfg file. The file is in the profiles folder of the LogAnalysisInfo folder #of your $PRODUCT_NAME installation.
#First, open that file with a text editor. Then search for reports = {
to find the
#beginning of the section which describes reports. There will be one group within that section for each report
#in the profile. Find the group corresponding to the report you want to edit. For instance, the file_type report
#would start like this:
# file_type = { # report_elements = { # file_type = { # label = \"%7B=capitalize(pluralize(print(database.fields.file_type.label)))=}\" # type = \"table\" # ... ##
To add a filter to the report, add an extra section within the report group, but outside the #report_elements group, like this:
## file_type = { # filter = { # expression = \"(page within '/dir1/')\" # } # report_elements = { # file_type = { # label = \"%7B=capitalize(pluralize(print(database.fields.file_type.label)))=}\" # type = \"table\" # ... ##
This adds a filter to to the report so it will show only events where the page field is inside the #directory /dir1/. Any filter expression is permitted here; here are some other examples:
## expression = \"(date_time > '01/Jan/2004 00:00:00')\" # expression = \"(date_time > '01/Jan/2004 00:00:00') and (page within '/dir1/')\" # expression = \"(date_time > '01/Jan/2004 00:00:00') and !(page within '/dir1/')\" ##
The first shows only hits since the beginning of 2004; the second shows only hits since the beginning of 2004 where #the hit was in /dir1/; the third shows only hits since the beginning of 2004 where the hit was not in /dir1/.
#If you want the filter to apply to a particular report element but not the the whole report, you can add this same #type of \"filter =\" expression inside the report element, e.g.:
## file_type = { # report_elements = { # file_type = { # label = \"%7B=capitalize(pluralize(print(database.fields.file_type.label)))=}\" # type = \"table\" # filter = { # expression = \"(page within '/dir1/')\" # } # ... ##
This creates a report where the filter applies to only that filter element. Most reports have only one report #element, but a report like Single-page summary may have several, and each can have its own filters.
#" # } favicon = { label = "favicon.ico" question = "Why do I see a hits on a file called \"favicon.ico\" in my statistics?" short_answer = "favicon.ico is a special icon file that Internet Explorer looks for when it first visits the site." long_answer = "Recent versions of Microsoft Internet Explorer, Safari, and other web browsers have a feature that lets web site owners define an icon for their site, which will appear in the address bar, the Favorites menu, and other places. If you create an icon file called favicon.ico in a directory of your web site, then any page in that directory that is bookmarked will appear in the Favorites menu with your custom icon. The browser checks for this file whenever a bookmark is created, so if you don't have the file, it will show up as a 404 (file not found) link. As a side note, this is a good way to see who is bookmarking your site.
" } multi_column_reports = { label = "Adding columns to report tables" question = "How can I add additional columns to report tables, e.g. to add a single report which reports source IP, destination IP, source port, and destination port?" short_answer = "Edit the report in the profile .cfg file to add a new item to the columns group." long_answer = "Edit the profile .cfg file, which is in the profiles folder of the LogAnalysisInfo folder. Look for \"reports = {\" to find the reports list. Look down until you find a report which shows a table for one of the fields you want, e.g. in the source_ip/destination_ip/source_port/destination_port example, you would look for the destination_port report (the actual name of this report, and of field values, will vary depending on your log format). The report will look something like this:
destination_port = { report_elements = { destination_port = { label = \"\\$lang_stats.destination_port.label\" type = \"table\" database_field_name = \"destination_port\" sort_by = \"events\" sort_direction = \"descending\" show_omitted_items_row = \"true\" omit_parenthesized_items = \"true\" show_totals_row = \"true\" starting_row = \"1\" ending_row = \"10\" only_bottom_level_items = \"false\" show_graph = \"false\" columns = { 0 = { type = \"string\" visible = \"true\" field_name = \"destination_port\" data_type = \"string\" header_label = \"%7B=capitalize(database.fields.destination_port.label)=}\" display_format_type = \"string\" main_column = \"true\" } # 0 1 = { header_label = \"%7B=capitalize(database.fields.events.label)=}\" type = \"events\" show_number_column = \"true\" show_percent_column = \"false\" show_bar_column = \"false\" visible = \"true\" field_name = \"events\" data_type = \"int\" display_format_type = \"integer\" } # 2 } # columns } # destination_port } # report_elements label = \"Destination report\" } # destination_port
There may be other columns, but the two shown here are a minimum -- one for the destination port field, and one for the \"events\" field (might be called \"packets\" or something else). This describes a report which has two columns: destination port and number of events.
To add a four-column source_ip/destination_ip/source_port/destination_port report, copy the entire thing and change the name to custom_report. Then duplicate the destination_port column three times, and edit the copies so they're source_ip, destination_ip, and source_port. The result:
custom_report = { report_elements = { custom_report = { label = \"Custom Report\" type = \"table\" database_field_name = \"destination_port\" sort_by = \"events\" sort_direction = \"descending\" show_omitted_items_row = \"true\" omit_parenthesized_items = \"true\" show_totals_row = \"true\" starting_row = \"1\" ending_row = \"10\" only_bottom_level_items = \"false\" show_graph = \"false\" columns = { source_ip = { type = \"string\" visible = \"true\" field_name = \"source_ip\" data_type = \"string\" header_label = \"%7B=capitalize(database.fields. source_ip.label)=}\" display_format_type = \"string\" main_column = \"true\" } # source_ip destination_ip = { type = \"string\" visible = \"true\" field_name = \"destination_ip\" data_type = \"string\" header_label = \"%7B=capitalize(database.fields. destination_ip.label)=}\" display_format_type = \"string\" main_column = \"true\" } # destination_ip source_port = { type = \"string\" visible = \"true\" field_name = \"source_port\" data_type = \"string\" header_label = \"%7B=capitalize(database.fields. source_port.label)=}\" display_format_type = \"string\" main_column = \"true\" } # source_port destination_port = { type = \"string\" visible = \"true\" field_name = \"destination_port\" data_type = \"string\" header_label = \"%7B=capitalize(database.fields.destination_port.label)=}\" display_format_type = \"string\" main_column = \"true\" } # destination_port 1 = { header_label = \"%7B=capitalize(database.fields.events.label)=}\" type = \"events\" show_number_column = \"true\" show_percent_column = \"false\" show_bar_column = \"false\" visible = \"true\" field_name = \"events\" data_type = \"int\" display_format_type = \"integer\" } # 2 } # columns } # custom_report } # report_elements label = \"Custom report\" } # custom_report
Finally, add it to the reports_menu list (again, this is easiest to do by duplicating the existing reports_menu item for destination port), like this:
custom_report = { type = \"view\" label = \"Custom Report\" view_name = \"custom_report\" visible = \"true\" visible_if_files = \"true\" } # custom_report
And you should have a Custom Report item in your reports menu, which links to the multi-column report.
If you're creating a two-column report, you can get an indented layout with subtables (rather than a \"spreadsheet\" layout) by adding the following section to the report group (e.g. right above the \"} # custom_report\" line, above):
sub_table = { ending_row = \"10\" omit_parenthesized_items = \"true\" show_omitted_items_row = \"true\" show_averages_row = \"false\" show_totals_row = \"true\" } # sub_table
This sub_table node will work only for reports which have exactly two non-numerical columns (e.g. source_ip/destination_ip).
" } graph_field = { label = "Changing the graph field" question = "How do I change the field which is graphed, e.g. from page view to bandwidth?" short_answer = "Edit the profile .cfg file, and change the field name in the numerical_fields section of that report element." long_answer = `If you want to change the field which is graphed, in the graph above a particular report table, do this:
Open the profile .cfg file (in the profiles $lang_stats.directory of the LogAnalysisInfo $lang_stats.directory) in a text editor.
Find the Reports section (search for "reports = {").
Scroll down until you see the report you want to change, for example "Days", so look for "days = {".
A few lines below that, find the line that says "graph = {". You should see this:
numerical_fields = { hits = "true" } # numerical_fields
Change this so that it reads:
numerical_fields = { visitors = "true" } # numerical_fields
You can substitute any numerical field name here, e.g. page_views, hits, visitors, bytes, etc. (you must use the internal name for the field, not the "display" label).
NOTE: In some cases, just refreshing the browser may not actually show the new graph. Once these changes have been made, $PRODUCT_NAME will be producing the new graph; it is the browser's job to show it to you. You may need to empty your browser's cache to see the change.
` } saveasnew = { label = "Saving filters during Save as New Report" question = "When I save a report for the first time, what happens to my filters?" short_answer = "If you have no filters active, then they will not be saved with your report." long_answer = `When you have created a new report and you select "Save as New Report" under the "Miscellaneous" button, any date or general filters which are not active will not be saved with the report; those selections will be dimmed out in the dialogue box. If you want filters saved, turn them on in the "Filters" menu and then save your report.
` } shortlongterm = { label = "Short- and Long-term Databases" question = "How do I get high detail for recent hits, and also long-term statistics, without using too much disk space?" short_answer = "Use two databases, one for high-detail short-term data, and one for low-detail long-term data." long_answer = "A common problem encountered by $PRODUCT_NAME users is the conflict between wanting to see all possible statistics, and wanting to be able to generate the database in a reasonable amount of time, memory, and disk space. With large web sites, it is often impossible to have both of these things--you probably don't have the computing power to fully analyze your multi-gigabyte logs and still have full detail on ten database fields. One easy solution to this problem is to have two databases (two profiles).The first, the \"all data\" profile, represents your entire log but with significant limitations on the fields (perhaps only the top two levels of the hostname field, the top two levels of the referrer, date/time to the day level only, etc.).
The second, the \"recent data\" profile, includes a filter to discard all log entries older than a day, or a week, or a month, depending on how recent you want it and how much data you get in a day. The \"recent data\" profile, since it has fewer log entries in it, can have much more detailed information; the date/time can go to the second level, all the hostnames can be there (for full visitor information), etc.
Using this technique, you'll end up with two views of your log data. For long-term trends, you can use the \"all data\" profile. For recent access information, in detail, you can use the \"recent data\" profile. The combined size and processing time of the two profiles will be much lower than if they were combined into one with the duration of \"all data\" and the depth of \"recent data.\"" } zoomfarther = { label = "Zooming Further" question = "How do I see more levels of statistics (i.e. how can I zoom in further)?" short_answer = "Increase the \"suppress below\" level for this database field in the profile options." long_answer = "
$PRODUCT_NAME limits the number of levels you see by default to save memory, disk space, and time. You can increase the levels on any database field like this:
Using a text editor, open the .cfg file for your profile, in the LogAnalysisInfo/profiles folder.
Find the database = { section.
Within that section, find the fields = { section.
Within that section, find the database field you want to change.
Increase the suppress_below value for that field.
Save the file.
Rebuild the database.
Then you'll be able to see as many levels as you chose. See also {=docs_chapter_link('resources')=}.
" } zoomsingle = { label = "Zooming on single files" question = "How do I see the number of downloads for a particular file (i.e. a newsletter PDF, or a template file PDF)?" short_answer = "Select PDF from the 'File Types' table and then use the Zoom Menu to Zoom to the URL's report, then Select the PDF you need to get an overview of that file. " long_answer = "Click on the 'Content' report group from the left hand menu, then click on the 'File Types' report. When the File Types report loads, click on 'PDF' from the table and the table will re-load with just a PDF entry and a menu will appear above the table with a list of all tables in it.
From that drop-down ({=docs_user_chapter_link('user_report_zoom_to')=}) select the 'Pages' or 'URLs' option (it could be either), and you should then see a page listing only the pages/URLs where the file type is PDF. You can then select the PDF from that list, and you will next see an Overview for that file only.
This type of filtering uses the {=docs_user_chapter_link('user_filter_zoom')=}; these are temporary filters that are applied to the report(s) as you click about (zoom about) the report. Clicking any item in the left hand menu cancels them, and you are returned to that report's default view, where no filters are set (unless the default has a filter set via the Report Editor, in which case that filter set will be applied).
If you want to filter items in the report, have the filter apply to the whole report, and be able to turn it on whenever you need it, it is better to use the {=docs_user_chapter_link('user_filter_global')=} available from the Filter icon in the toolbar (just above the report). These can be created, enabled, and disabled as you need them; you only need to create them once, and they are stored under your username and the profile you are using, for the next time you need them. Zoom filters are not stored anywhere and need re-applying each time you need the filter set.
" } datatypes = { label = "Definitions of Numerical Fields" question = "In web server analyses, what are \"hits,\" \"page views,\" \"bandwidth\" or \"bytes,\" \"visitors,\" or \"sessions\"? In media analyses, what are \"stream duration,\" \"play duration,\" \"pause duration,\" \"session duration,\" \"events,\" \"streams,\" or \"concurrent connection,\" or \"successful accesses\"?" short_answer = "Hits are accesses to the server; page views are accesses to HTML pages; visitors are unique visitors to the site, and sessions are visits to the site. Play duration is the most useful measure of time actually spent playing; pause duration is time spent paused; stream and session duration are the time spent connected; events is the total number of log lines; stream is the unique number of streams accessed; successful accesses are the number of non-error streaming events." long_answer = `$PRODUCT_NAME can count web log traffic in several ways. Each way is counted independently of the others, and each has its own advantages in analyzing your traffic. The different types are:
Hits. Hits are accepted log entries. So if there are 5000 entries in your log file, and there are no log filters, and all the entries are valid (i.e. none of them have corrupt dates), then $PRODUCT_NAME will report 5000 hits for the file. If there are log filters that reject certain log entries, then those will not appear as hits. Log entries that are accepted by the log filters will count toward the hits totals. Because there are no default filters that reject, you will generally have nearly as many reported hits as you have log entries. You can view and edit the log filters by Opening your profile from the Administrative Menu, clicking Profile Options, and then clicking the Log Filters tab. See also {=docs_chapter_link('filters')=}.
Page views. Page views correspond to hits on pages. For instance, if you're analyzing a web log, and a hit on /index.html is followed by 100 hits on 100 images, style sheets, and JavaScript files that appear in that page, then it will count as a single page view -- the secondary files do not add to the total. This is implemented in the log filters -- page views are defined as log entries that are accepted by the log filters, and that have a page_view value set to 1 by the log filters. Log entries that are accepted by the filters, but have page_view of 0 set by the filters do not contribute to the page views total. Therefore, you have complete control over which files are \"real\" page views and which are not -- if $PRODUCT_NAME's default filters do not capture your preferred definition of page views, you can edit them until they do. By default, page views are all hits that are not GIF, JPEG, PNG, CSS, JS, and a few others. See Hits, above, for more information on log filters.
Visitors. Visitors correspond roughly to the total number of people who visited the site. If a single person visits the site and looks at 100 pages, that will count as 100 page views, but only one visitor. By default, $PRODUCT_NAME defines visitors to be \"unique hosts\" -- a hit is assumed to come from a different visitor if it comes from a different hostname. This can be inaccurate due to the effects of web caches and proxies. Some servers can track visitors using cookies, and if your web logs contain this information, $PRODUCT_NAME can use it instead of hostnames -- just change the log_field value for the visitors database field to point to the cookie field, rather than the hostname field.
Bandwidth. Bandwidth is the total number of bytes transferred. It is available only in log formats that track bytes transferred. Bandwidth is tracked for every log entry that is accepted, whether it is accepted \"as a hit\" or \"as a page view\". For log formats which track both inbound and outbound bandwidth, $PRODUCT_NAME can report both simultaneously.
Sessions. Several of $PRODUCT_NAME's reports deal with \"session\" information, including the \"sessions overview\" and the \"paths (clickstreams)\" report. Sessions are similar to visitors, except that they can \"time out.\" When a visitor visits the site, and then leaves, and comes back later, it will count as two sessions, even though it's only one visitor.
Session events. A page view which occurs during a session is a session event. For web server logs, this number is similar to page views, but may be smaller, because it does not include page views which are not in any session. That can occur if the page view is a reload (two consecutive hits on the same page), or if the page view is a part of a session which has been discarded because it is too long.
Media servers have their own distinct numerical fields. Some of these are directly from the log data; others are computed by the $PRODUCT_NAME plug-in. Different plug-ins report different fields; fields may include:
Stream Duration. Available in Flash, Limelight, Wowza, and other analyses, this field reports the total amount of time spent streaming, including time spent playing, paused, or seeking.
Play Duration. Available in Wowza analysis, this field reports the total amount of time spent playing/viewing the video/audio stream. Time spent paused, or time spent seeking, is not included in this number. This is usually the most useful of the Duration fields.
Pause Duration. Available in Wowza analysis, this field reports the total amount of time spent paused during a streaming session.
Session Duration. Available in Flash and other analyses, this field reports the total amount of time the session was active, from the first 'session start' event in the log to the 'session end' event, which includes the stream duration and also the time after the session starts and before streaming begins, and the time after streaming and before the end of the session.
Publish Duration. Available in Wowza analysis, this field reports the total amount of time the stream was published, from the first 'publish' event in the log to the 'unpublish' event, which includes the stream duration and also the time after connection and before streaming begins, and the time after streaming and before disconnection.
Events. Available in Microsoft Media Server, Flash, Wowza, and other analyses, this reports the total number of server events, i.e., the total number of lines in the log data, regardless of the event type. This includes events like connect, disconnect, and errors, in addition to successful stream events.
Streams. Available in Wowza, this counts the number of unique values of the x-stream-id field, i.e., the number of unique streams viewed.
Successful Accesses. Available in Microsoft Media Server, this reports the number of stream attempts which did not result in an error, i.e., those events which had a status code in the 200s, or 408. Broken links and other errors are not counted in this number.
Concurrent Connections. Available in Wowza, Flash, Microsoft Media Server, and any other media format which uses the Media Reports snapon, this reports the number of concurrent connections to the server.
How Concurrent Connections are Calculated and Reported
$PRODUCT_NAME uses a database filter to sort the events chronologically, and then tags each event with a concurrent connection value representing the number of connections at the time of that event, as computed by examining the start and end of each event and counting the overlapping connections at each logged timestamp. For non-aggregating reports like Log Detail, this value is shown directly in the report. For aggregating reports (most reports are aggregating reports), the column shows the maximum number of concurrent connections for all aggregated events (e.g., for all events contributing to a particular line of a table report). This is a global number, computed globally across the entire dataset, so it is generally only useful for global reports like Overview, and for date/time reports like Year, Months, Days, etc. Adding this column to other reports will give results which are correct according to the algorithm described above, but probably are not what is expected. For instance, adding it to Countries will not give the number of simultaneous connections from each country; it will give the maximum number of simultaneous connections which existed to the whole server (from all countries) at a time when each country accessed the server. Similarly, this number does not show the maximum concurrent connections per stream name or per publishing point, when seen in the Streams report or similar. If you want to know the number of connections to a particular stream (rather than the number of connections to the server when the stream was accessed, which is what this global value would show), you can attach another Concurrent Connections snapon, and choose the stream name as the "resource" field; this will count concurrent connections separately, not globally but for each value of that field, which will give the expected results in a report of that field only.
Different log formats contain different types of information. All major web log formats include page, date/time, and browsing host information, but not all contain referrer and agent information. If your log format does not include referrer or agent information, $PRODUCT_NAME will not include that information in its database. The easiest way to get referrer or agent information is to change your web server's log format to an \"extended,\" or \"combined\" format, which includes referrer and agent information; then $PRODUCT_NAME will automatically include referrer and agent information in the database and in the statistics.
If it's not possible to change your log format, and you have a separate referrer log (often called referer_log), then you can analyze that log directly with $PRODUCT_NAME. Just point $PRODUCT_NAME at the log, and $PRODUCT_NAME should recognize it as a referrer log. $PRODUCT_NAME will show statistics with referrer and page information. Host and date/time information are not available in a standard referrer log, so referrer and page information is all $PRODUCT_NAME can extract. By using an extended or combined log format, you will be able to do more powerful queries, for instance to determine the referrers in the most recent week.
Similarly, if you can't configure your server to use extended or combined, but you have a separate agent log, you can analyze it with $PRODUCT_NAME by creating a separate profile that analyzes the agent (web browser and operating system) information only. Since an agent log contains only agent information, you won't be able to cross-reference the agent information with page, date/time, host, referrer, or anything else; to do that, you'll need an extended or combined format log.
To analyze error information, you'll need an error log (often called error_log
).
Just point $PRODUCT_NAME at your error log when you create the profile. Since the
error log contains only error messages, you won't be able to cross-reference the errors
against page, date/time, or any other fields; if you need cross-referencing of errors,
you may be able to get what you need by cross-referencing the \"server response\" field of your
normal web log to the fields you need cross-referenced; then apply \"404\"
as a filter on the server response field
and you'll see only those web site hits that generated
404 (file not found) errors.
$PRODUCT_NAME can answer this sort of question for any combination of fields. All you need to do is use the zoom filters (or global filters) to zoom in on the item you want specific information for, and then use \"default report on zoom\" to switch to the report that shows the data you want. For instance, if you want to know the top search engines for a particular search phrase, click Search phrases, then click a particular search phrase, and then choose \"Search engines\" from the \"Default report on zoom\" menu. The resulting table will show you a breakdown by search engine of the hits for the search phrase you selected.
" } sessionsforpage = { label = "Sessions For A Particular Page" question = "How can I see only the visitors that entered at a particular page, or only the visitors that hit a particular page at some point in their session?" short_answer = "Use the global filters to show only sessions containing that page; reports will only show sessions including that page." long_answer = "In the global filters, add a filter to show only sessions containing that page. Then return to the reports; until you remove that global filter, all reports will show only traffic for sessions containing a particular page.
" } sessionsforsearchengine = { label = "Sessions For A Particular Search Engine" question = "How can I see only the visitors that came from a particular search engine?" short_answer = "Direct that search engine to a particular entry page, and then use global filters to show only sessions for that page." long_answer = "Some information of this type is available in the \"Search engines\" view -- you can zoom in on a particular search engine by clicking its name there, and then switch to the top visitor hostnames view to see which hosts came from that search engine, and other information about the traffic from that search engine. But that only works for the first click, because after that, the log data no longer lists the originating search engine (the referrers are internal from that point on). So you can see how much traffic search engines brought, but what if you want to see what the visitors from a particular search engine did after they came to the site?
You can do that by using custom entrance pages and Global Filters. Start by pointing each search engine to its own URL, where possible. For instance, instead of pointing Overture to http://www.mysite.com/index.html, you can point it to http://www.mysite.com/index.html?source=overture instead. Once you've done that, all traffic from Overture will initially arrive at the /index.html?source=overture page. By showing only sessions containing that page (see {=docs_faq_link('sessionsforpage') =}), you can show the session activity of Overture visitors, including what paths they took, how long they stayed, and more.
You can do the same thing for any search engine, advertising campaign, or link exchange that allows you to choose your URL. It won't work quite as easily for broad search engines like Google, which let people enter your site at any point, but it's still possible to \"tag\" the URL similarly using a log filter -- see {=docs_faq_link('conversions') =}.
" } conversions = { label = "Tracking Conversions" question = "I want to track conversions-- i.e. I want to know which of my ads are actually resulting in sales. Can $PRODUCT_NAME do that?" short_answer = "Yes -- encode source information in your URLs and use global filters to show the top entry pages for your \"success\" page." long_answer = "If you advertise your web site, one of the most useful pieces of information you can get from $PRODUCT_NAME is information on \"conversions\"; i.e. how effective your ads are at actually generating sales, sign-ups, or whatever it is that makes your site a success. $PRODUCT_NAME can provide highly detailed conversion information with a little effort. Here's how you do it:
Make sure that every URL leading to your site is tagged with information that tells you where it came from. E.g. for an Overture keyword \"red umbrellas\" use http://www.mysite.com/?source=overture&keyword=red+umbrellas. Do the same for all ads. This is a good idea anyway (and Overture recommends it), but it's essential if you want to track conversions in $(PRODUCT_NAME). Do this for every link leading to your site. Obviously, you can't do this for URLs you have no control over (like Google searches), but you can do it for all your ads, which are the important ones from a conversion perspective.
Wait for some traffic to arrive with the parameterized URLs.
Remove the \"page parameters\" log filter, in the Log Filters section of the profile Config, so $PRODUCT_NAME will track page parameters (see http://sawmill.net/cgi-bin/sawmilldocs?ho+faq-pageparameters).
Go to the \"Entry pages\" view in your statistics. You should see all your full URLs there, with percentages if you want, which will tell you how much traffic each ad brought to your site. For instance, if you see that you got 1000 entries to the http://www.mysite.com/?source=overture&keyword=red+umbrellas page, then you know that your Overture ad for \"red umbrellas\" brought 1000 hits. That's useful information, but not conversion information--that comes next.
Edit the global filters in the reports, and set the filters to show only sessions that went through your \"success\" page. This is the page that people see after they've done whatever you wanted them to do. For instance, if success for you means a sale, then this would be the \"thank you for your order\" page. If success means that they sign up, this is the \"you have signed up\" page. If success means that they submitted a feedback form, this is the \"thanks for your feedback page.\"
Now you're looking at the \"Entry pages\" view, but it's been filtered to show only those sessions which eventually \"converted\". This is exactly what you want to know -- if you see 100 entrances at http://www.mysite.com/?source=overture&keyword=red+umbrellas , then you know that 100 visitors found your site from your \"red umbrellas\" ad on Overture, and eventually hit your success page later in the same session. This is pure marketing gold -- by comparing the total cost of the ad (e.g. if each click is 0.10, and there were 1000 total clicks, then you spent 100.00 on that keyword), with the total payout of the ad (e.g. if each \"success\" is worth 5.00 in currency, then you know you made 500.00 from the 100 successful \"red umbrellas\" clicks), you can tell whether the ad is worth it. In this example, you paid \\$100 for the ad and got 500.00 in sales from it -- keep that one running!
$PRODUCT_NAME can handle URLs/pages in any format, but by default it strips off the parameters (the part after the question mark) to save space in the database. Most people don't need the parameters, but if you have a dynamic web site, you do. To see the parameters, do this:
Go to the Config section of your profile.
Click Log Filters.
Find the Log Filter which replaces everything after \"?\" with \"(parameters)\".
Delete that log filter.
Rebuild the database.
Now, when you look at the \"Pages\" or \"Pages/directories\" view, you should see your complete URLs, along with the parameters.
If you want to take it a step further, you can also set up log filters to extract certain sections of your URLs, and put them in custom fields, to make your statistics more readable. For instance, if you have a store with several items in it, you can create an \"items\" field, with an associated \"Top items\" view, and you can set up a log filter to extract the item number (or name) from the URL and put it in the \"items\" field. Or you can even set up a filter to extract the item numbers from your URLs, convert them to the actual name of the item, stick them in the \"item\" field, and report them in the \"top items\" view. This is an example of a \"custom field\" -- see {=docs_faq_link('custom_fields') =} for information on how to create one.
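For illustration only, a log filter along these lines could fill an \"items\" field from the URL. This sketch assumes item pages look like /store/item/1234, that the items field has already been created as described in the custom fields FAQ, and that your version provides the matches_regular_expression() function (check the log filter documentation for the exact function name and syntax):
if (matches_regular_expression(page, \"^/store/item/([0-9]+)\")) then items = $1
The parenthesized part of the regular expression captures the item number, and the filter copies it into the items field for each matching hit.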
" } languagemodules = { label = "Language Modules--Localization and Customization" question = "Is $PRODUCT_NAME available in languages other than English? How can I change the output of $PRODUCT_NAME to be in a different language, or to use different wording?" short_answer = "$PRODUCT_NAME is currently available in English, German, and Japanese, and can be translated into any language fairly easily. Customization of output text is also easy." long_answer = "$PRODUCT_NAME has a feature designed for just this purpose, called Language Modules. Language modules are text files which contain all of the text that $PRODUCT_NAME ever generates. You can translate part or all of $PRODUCT_NAME into any language by modifying the language modules. English, German, and Japanese translations already exist. Language modules can also be used to customize the output of $PRODUCT_NAME in almost any conceivable way. For full details, see {=docs_chapter_link('languagemodules')=} in the online manual.
" } runatstartup = { label = "Running $PRODUCT_NAME at System Startup" question = "Can I set up $PRODUCT_NAME to start automatically when the computer starts up?" short_answer = "Yes; run it as a Service on Windows; use StartupItems under MacOS X; use the /etc/rc.d mechanism on UNIX systems that support it." long_answer = "$PRODUCT_NAME can be configured to run at startup in the same way any other program can, and the exact method depends on your operating system. Here's how:
On Windows:
$PRODUCT_NAME is automatically installed as a Service, and will be running as soon as installation is complete. The Service is set to automatically start when the system starts up. You can edit Service parameters, for instance to have it run as a different user, or to have it start manually, in the Services control panel.
On Linux (Red Hat) and similar UNIX systems:
Do the following as root:
Move the $(PRODUCT_EXECUTABLE_DOCS)d
file from the Extras/RH9 directory of your
$PRODUCT_NAME installation, to /etc/rc.d/init.d. Type
chkconfig --add sawmilld
chkconfig --level 2345 sawmilld on
to install it and turn it on.
Rename the $PRODUCT_NAME executable to $PRODUCT_EXECUTABLE_DOCS
(or change the name
of the executable in the script) and put it in /etc/$PRODUCT_EXECUTABLE_DOCS.
Put a symbolic link to LogAnalysisInfo in /etc/$PRODUCT_EXECUTABLE_DOCS/LogAnalysisInfo
(or you can put the actual directory there), using the ln -s
command,
e.g.
ln -s /usr/home/$PRODUCT_EXECUTABLE_DOCS/LogAnalysisInfo /etc/$PRODUCT_EXECUTABLE_DOCS/LogAnalysisInfo
(you'll need to create the directory /etc/$PRODUCT_EXECUTABLE_DOCS first).
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc0.d/K15$(PRODUCT_EXECUTABLE_DOCS)d
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc1.d/K15$(PRODUCT_EXECUTABLE_DOCS)d
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc2.d/K15$(PRODUCT_EXECUTABLE_DOCS)d
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc3.d/S85$(PRODUCT_EXECUTABLE_DOCS)d
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc4.d/S85$(PRODUCT_EXECUTABLE_DOCS)d
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc5.d/S85$(PRODUCT_EXECUTABLE_DOCS)d
ln -s /etc/rc.d/init.d/$(PRODUCT_EXECUTABLE_DOCS)d /etc/rc.d/rc6.d/K15$(PRODUCT_EXECUTABLE_DOCS)d
If you're not sure where to put the $PRODUCT_NAME links or what to call them, and you have Apache installed on your system, look for files with names containing
httpd
in /etc/rc.d or /etc/init.d, and use the same names and locations for $PRODUCT_NAME, replacing
httpd
with $(PRODUCT_EXECUTABLE_DOCS)d
.
On other UNIX systems, the setup is similar. Rename the $PRODUCT_NAME executable to $(PRODUCT_EXECUTABLE_DOCS)d (or change the name of the executable in the script) and put it in /bin or somewhere else in your default path. Put a symbolic link to LogAnalysisInfo in /etc/$PRODUCT_EXECUTABLE/LogAnalysisInfo (or you can put the actual directory there), using the ln -s command, e.g. ln -s /usr/home/$PRODUCT_EXECUTABLE/LogAnalysisInfo /etc/$PRODUCT_EXECUTABLE/LogAnalysisInfo (you'll need to create the directory /etc/$PRODUCT_EXECUTABLE first).
When you run $PRODUCT_NAME from the command line in UNIX by just typing the name of the program, it runs in the foreground. That means that you don't get your prompt back until $PRODUCT_NAME exits, and it also means that if you close your terminal window, the $PRODUCT_NAME server will terminate and you will not be able to use it anymore until you open another terminal window and restart $(PRODUCT_NAME). Often, that's not what you want--you want $PRODUCT_NAME to keep running after you close the window. You can do that by running $PRODUCT_NAME in the background.
To run $PRODUCT_NAME (or any other UNIX program) in the background, add an ampersand (a & character) to the end of the command line; for instance, you might use the following command line:
./$PRODUCT_EXECUTABLE &
if the name of your $PRODUCT_NAME program is $PRODUCT_EXECUTABLE
.
When you type this, you will see one line of output as $PRODUCT_NAME is backgrounded,
and a few lines from $PRODUCT_NAME describing the running web server,
and then you will have your shell prompt back, so you can type more commands.
At this point, $PRODUCT_NAME is running in the background. You can type exit
at the prompt to close the shell, or you can just close the window, and $PRODUCT_NAME
will continue to run in the background.
On some rare occasions, $PRODUCT_NAME may generate output to the shell console. This is not usually a problem, but on some systems, background programs that generate output may be suspended, and that can make $PRODUCT_NAME inaccessible. To prevent this from happening, you may want to use this command line instead:
nohup ./$PRODUCT_EXECUTABLE &
The \"nohup\" part of the command line stands for \"no hang-up\" and
prevents this sort of output-related suspension problem.
Unfortunately nohup
doesn't exist on all systems.
If you don't know if your system supports nohup
,
try including nohup
on the command line--if it doesn't run that way, don't use it.
You can see current background jobs started from the current terminal using the jobs
command (with most shells). You can terminate a background job by bringing it to the front using the fg
command and then using control-C, or using the kill
command together with its process ID.
You can find the process ID (pid) of any background process (including ones started in other windows)
using the ps
command. For more information about any of these commands, use the man
command (e.g. type man ps
), or consult your UNIX documentation.
$PRODUCT_NAME stores most of its data, including all internal databases, in a $lang_stats.directory called LogAnalysisInfo. By default, this is in the same $lang_stats.directory as the $PRODUCT_NAME binary. If you want it to be somewhere else, there are several options:
Create a symbolic link (non-Windows only). This works only on non-Windows systems. If $PRODUCT_NAME is installed in a directory called /some/dir/sawmill, and you want LogAnalysisInfo to be at /some/other/dir/LogAnalysisInfo, you can do this from the command line:
mv /some/dir/sawmill/LogAnalysisInfo /some/other/dir
ln -s /some/other/dir/LogAnalysisInfo /some/dir
This creates a symbolic link from the installation location to the new location of LogAnalysisInfo, which $PRODUCT_NAME will automatically follow. By the way, you can also just move LogAnalysisInfo to /var/sawmill/LogAnalysisInfo, and $PRODUCT_NAME will look for it there.
Create a file LogAnalysisInfoDirLoc (for Windows). This is a text file containing the location of the LogAnalysisInfo folder. This is most useful on Windows; on other platforms you can use a symbolic link, as described above. For instance, if you installed $PRODUCT_NAME in C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8, and want LogAnalysisInfo to be at E:\\LogAnalysisInfo, you can move it from C:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8\\LogAnalysisInfo to the E: drive, and then create a text file (with Notepad) called LogAnalysisInfoDirLoc, with no .txt extension, and type E:\\LogAnalysisInfo as the contents of that text file. If $PRODUCT_NAME does not find a LogAnalysisInfo $lang_stats.directory in the $lang_stats.directory where it is running, it will look for that file, LogAnalysisInfoDirLoc, and it will look for LogAnalysisInfo in the location specified by the contents of that file.
Regular expressions are not fully standardized -- different programs that support \"regular expressions\" may support slightly different features. For instance, some will let you use {N} to repeat the preceding expression N times, and some will not (they will require you to write the expression N times yourself). Some will let you use \\d to match any digit, and others will not (they will require you to use [0-9]). So the point of this question is, which of these \"non-standard\" features does $PRODUCT_NAME support? The answer depends on the platform you're running $PRODUCT_NAME on.
$PRODUCT_NAME's regular expressions vary depending on platform -- it uses the built-in regular expression library on some platforms, and the Boost library on others. Anything that is documented in {=docs_chapter_link('regexp')=} is available on all platforms. Anything that is not documented there may not be available. The easiest way to find out if something is available is to try it -- add a regular-expression filter to your Log Filters and see if it works. But if you want to make sure your profile is portable, and will work on other platforms, you should stick to the documented choices.
" } regexpcase = { label = "Regular Expression Case-sensitivity" question = "Are $PRODUCT_NAME's regular expressions case-sensitive?" short_answer = "Yes." long_answer = "Yes -- the regular expression Dog
matches Dog
,
but not dog
or DOG
. If you need to match
case-insensitively in a log filter, you can convert the field to lowercase first
(copy it to another temporary field if you don't want to modify the original), or you can explicitly list
upper and lower case values for every letter, e.g. [Dd][Oo][Gg] matches \"dog\" case-insensitively.
" } debugfilters = { label = "Debugging Log Filters and Log Formats" question = "How can I see what $PRODUCT_NAME is doing while it parses my custom log format, or applies my log filters?" short_answer = "Rebuild the database from the command line with debugging output, e.g. $PRODUCT_EXECUTABLE -p profilename -a bd -v egblpfd."
long_answer = "Custom log formats and log filters can be difficult to debug from the graphical interface, because there is little feedback about what $PRODUCT_NAME is doing as it processes the log. Fortunately, $PRODUCT_NAME has a powerful feature called \"debugging output\" that makes debugging custom log formats and filters much easier.
To see the debugging output, you need to use a command-line version of $(PRODUCT_NAME). On Windows, that means using the $(PRODUCT_NAME)CL.exe program, and running it from the command prompt. On Unix, you can use the normal $PRODUCT_NAME executable, since it works on the command line. On MacOS, you need to use the MacOS X command-line version of $PRODUCT_NAME.
Using the command shell, go to the $PRODUCT_NAME installation directory (using the \"cd\" command). Then rebuild the database like this:
$PRODUCT_EXECUTABLE_DOCS -p profilename -a bd -v egblpfd | more
This command rebuilds the database for the profilename profile, and
-v egblpfd
tells $PRODUCT_NAME to report a great deal of information
about what it's doing (other -v options are available, but egblpfd are the seven options which
are most useful for debugging profiles and filters). The results are piped through the \"more\" program,
so you can page through the output using the space bar. Lines starting with
\"Processing line\" show when $PRODUCT_NAME is processing a new log line.
Lines starting with \"Marking hits\" show the end results that are being
put into the database. Other lines provide information about log parsing and
filtering that can be very useful when you're trying to debug a problem in the
parsing of your custom format, or in your custom log filter.
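If there is too much output to page through comfortably, you can also capture it to a file using ordinary shell redirection, and then search the file afterwards; the filename here is just an example:
$PRODUCT_EXECUTABLE_DOCS -p profilename -a bd -v egblpfd > debug_output.txt 2>&1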
"
}
copyprofile = {
label = "How to Copy a Profile"
question = "How can I create a new profile, by copying an old one?"
short_answer = "Take an existing profile and change the first line to the new name."
long_answer = `You can take an existing profile, keep its settings, and create a new one from it by editing the first line:
It is not possible to rename a profile through the web interface. You can create a new profile with the desired name, and delete the old one. If the original profile is customized, all customizations will have to be redone in the new profile.
To change the name of the profile without recreating it, you can edit the profile .cfg file using a text editor. The file is in LogAnalysisInfo/profiles. Search for "create_profile_wizard_info" in the file, and on the line above it, you will see the label of the profile. The label shows how the profile appears in the web interface, so changing this label line will change the name in the web interface. It does not change the "internal" name, however, which is used from the command line to refer to the profile.
If you also need to change the internal name, you will need to rename the profile .cfg file. Do this by changing the filename, e.g., oldprofile.cfg becomes newprofile.cfg. It is CRITICAL that you also change the first line of the .cfg file (using a text editor) to match the filename, without the .cfg extension; so the first line would change from:
oldprofile = {
to
newprofile = {
If you do not do this, the $PRODUCT_NAME profile list will give an error, rather than listing the profiles, and other parts of $PRODUCT_NAME may be broken as well. The first line of any .cfg file must always match the filename. Once the filename and first line have been changed, the internal name of the profile will be the new name, and you will also be able to use the new name from command lines. You may also need to manually edit LogAnalysisInfo/schedule.cfg, if you have scheduled tasks which refer to the old name.
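As a sketch, copying (rather than renaming) a profile from a UNIX command line might look like this, assuming an existing profile named oldprofile and a new profile to be called newprofile:
cd LogAnalysisInfo/profiles
cp oldprofile.cfg newprofile.cfg
Then open newprofile.cfg in a text editor, change the first line from oldprofile = { to newprofile = { , and change the label line (above create_profile_wizard_info) so the copy appears with its own name in the web interface.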
` } # rename_profile schedulercgi = { label = "Using the Scheduler with CGI Mode" question = "How can I run $PRODUCT_NAME in CGI mode, and still use the $PRODUCT_NAME Scheduler?" short_answer = "Use an external Scheduler to run jobs or to call the $PRODUCT_NAME Scheduler, or run $PRODUCT_NAME in both CGI and web server modes." long_answer = "$PRODUCT_NAME's built-in scheduler can only run scheduled jobs if $PRODUCT_NAME is actually running when the job's time comes. That's fine if you're running $PRODUCT_NAME in web server mode, where it runs all the time. But in CGI mode, $PRODUCT_NAME only runs when someone is actively using it, so scheduled tasks will not run. There are three main solutions to this problem: use an external scheduler to call $PRODUCT_NAME's scheduler, use an external scheduler to run the jobs directly, or run $PRODUCT_NAME on both CGI and web server modes, with the CGI mode doing everything but the scheduled jobs, and web server mode handling those.
On UNIX, the most common scheduler is cron. You can set up cron to call $PRODUCT_NAME's scheduler by running the command (as root)
crontab -e
from the UNIX command line, and then adding
* * * * * sudo -u apache /full/path/to/$PRODUCT_EXECUTABLE -scheduler
to the resulting file. You will need to replace \"apache\" with the name of your CGI user, and
you will need to replace /full/path/to/$PRODUCT_EXECUTABLE
with the full pathname of your $PRODUCT_NAME executable. This tells cron to run $PRODUCT_NAME
every minute, as the CGI user, with the -scheduler
option (which tells $PRODUCT_NAME to run any
scheduled jobs, and exit).
Another option is to run your $PRODUCT_NAME database builds and other jobs directly with cron; for instance you could add a line like this:
0 0 * * * sudo -u apache /full/path/to/$PRODUCT_EXECUTABLE -rfcf configname -cm ud
(replace \"apache\" with the name of your CGI user) to update the profile specified by configname every night at midnight (the first number is the minute of the hour when the job should be run; the second number is the hour when the job should be run, and * * * means to run it every day).
Yet another option is to run $PRODUCT_NAME in web server mode as well as CGI mode, with the web server mode instance running only for the purpose of running jobs. The two will not interfere with each other; just start $PRODUCT_NAME from the command line using
/full/path/to/$PRODUCT_EXECUTABLE &
and it will continue to run until the next reboot. If you want $PRODUCT_NAME to automatically restart itself at system startup, see {=docs_faq_link('runatstartup') =}.
Unfortunately, Windows Scheduler does not let you run jobs every minute (like UNIX cron does), so you cannot use it to call the $PRODUCT_NAME Scheduler directly. However, other options are available. You can use the Windows Scheduler to run your $PRODUCT_NAME jobs directly. For instance, to set $PRODUCT_NAME to update the database for a particular profile every night, do this:
Open the Scheduled Tasks control panel.
Double-click Add Scheduled Task, and click Next.
Choose \"$PRODUCT_NAME (CGI)\" from the list (if it does not appear, Browse and locate the $(PRODUCT_NAME).exe file, which is usually at c:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8{='\\\\' . expand('$PRODUCT_NAME')=}.exe), and click Next.
Click Daily and click Next.
Choose a time to run the build; sometime in the middle of the night (like midnight) is a good choice, and click Next.
Enter your username and password, and click Next.
Click \"Open advanced properties for this task when I click Finish,\" and click Next.
Add \"-p profilename -a ud\" to the end of the Run field, and click OK.
Now Windows Scheduler is configured to update your database automatically every day.
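For reference, after the last step the completed Run field might look something like this (assuming the default installation path shown above, and a profile named profilename):
\"c:\\Program Files{='\\\\' . expand('$PRODUCT_NAME')=} 8{='\\\\' . expand('$PRODUCT_NAME')=}.exe\" -p profilename -a ud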
Another option is to run $PRODUCT_NAME in web server mode as well as CGI mode, with the web server mode instance running only for the purpose of running jobs. The two will not interfere with each other; just start $PRODUCT_NAME by double-clicking its icon (you can also configure it to start whenever your computer restarts, using Windows Scheduler), and scheduled jobs will run as long as $PRODUCT_NAME is running. If you need $PRODUCT_NAME to be running while you are logged out, see {=docs_faq_link('windowsservice') =}.
" } hundredpercentcpu = { label = "$PRODUCT_NAME Uses Too High a Percentage of CPU" question = "When I process log data with $PRODUCT_NAME, it uses most or all of my processor; it says it's using 90%, or even 100% of the CPU. Should it be doing that? Is that a problem?" short_answer = "Yes, it should do that, and it's not usually a problem. Any CPU-intensive program will do the same. However, you can throttle it back if you need to, using operating system priorities." long_answer = "$PRODUCT_NAME is a \"CPU-bound\" program while it's processing logs, which means that the microprocessor (a.k.a. CPU) is the bottleneck; the disk feeds data to $PRODUCT_NAME as fast as the processor can handle it. Most programs you use daily (web browsers, mail programs, word processors, etc.) are probably not CPU-bound, but any number-crunching or data-crunching program is. Other examples of programs that are typically CPU-bound include compression/decompression programs like ZIP, 3D rendering programs, and encryption programs (or encryption breakers).
Any well-behaved operating system will give a CPU-bound process as much CPU as it has available, provided that the processing needs of all other processes are met as well. Because most systems use only a small fraction of their processing power, there is usually more than 90% free CPU available at any time. This CPU is wasted unless it is used, so if there's a program like $PRODUCT_NAME that's continually asking for more CPU, the operating system should and will give it as much CPU as possible. If nothing else is running on the system, $PRODUCT_NAME will use 100% of the CPU. Since nothing else needs the CPU, that's as it should be--if the operating system only gave $PRODUCT_NAME 50% of the CPU, it would take twice as long to process the log data, and during the other 50% of the time, the CPU would be sitting idle, wasting time. So don't worry if $PRODUCT_NAME is using nearly 100% of your CPU--that's the way it's supposed to be, and it will generally have no negative effects.
The one time you may see negative effects of $PRODUCT_NAME's CPU usage is if there are other CPU-bound or CPU-intensive programs running on the system. In this case, because all the programs want as much CPU as possible, the operating system will split the CPU evenly between them. For instance, if there are three CPU-intensive processes running, each of them will get 33% of the CPU, and each will run 1/3 as fast as it would on a lightly loaded machine. If you have an important CPU-intensive process running on your server (for instance, a very busy web server), you may want to give $PRODUCT_NAME a lower priority than the other processes. You can do this on UNIX systems using the \"nice\" command, and on Windows systems using the Task Manager. When you set $PRODUCT_NAME's priority to lower than the rest, it will get less than its share of CPU time, and the other processes will run faster. $PRODUCT_NAME, of course, will run slower. Similarly, if other processes are interfering with $PRODUCT_NAME's performance and you don't care about the performance of the other processes, you can increase $PRODUCT_NAME's priority to make it run faster, at the expense of the other processes.
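For instance, on a UNIX system you could start a low-priority database build with a command along these lines (a sketch only; adjust the path, profile name, and priority value for your setup):
nice -n 19 /full/path/to/$PRODUCT_EXECUTABLE -p profilename -a bd
The nice -n 19 prefix asks the operating system to give the process the lowest normal scheduling priority, so other processes get the CPU first.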
Even programs that are not normally CPU-bound will have moments when they become briefly CPU-bound. For instance, a web browser sits idle most of the time, using almost no CPU, but when you load a complex page, it briefly uses as much CPU as it can get to compute and display the page. During that period, if $PRODUCT_NAME is running, each program will get 50% of the CPU. So the layout will take twice as long as it does when $PRODUCT_NAME is not running, which will make the web browser feel more sluggish than usual. Other programs, and the operating system itself, will similarly feel more sluggish while $PRODUCT_NAME is processing the log data. This is a side effect of having a CPU-bound program running on the system--everything else will slow down. Setting $PRODUCT_NAME to a lower priority will help in this situation, because the web browser will get nearly 100% of the CPU (while $PRODUCT_NAME is temporarily halted) while it's rendering.
" } custom_fields = { label = "Creating Custom Fields" question = "How can I group my events in broad categories (like \"internal\" vs. \"external\" or \"monitoring\" vs. \"actual\"), and see the events on each category separately, or see them combined? How can I create content groups? How can I include information from an external database in my reports, e.g. include the full names of users based on the logged username, or the full names of pages based on the logged URL? How can I extract parts of the URL and report them as separate fields?" short_answer = "Create a new log field, database field, report, and report menu item to track and show the category or custom value, and then use a log filter to set the log field appropriately for each entry." long_answer = "It is often useful to report information in the reports which is not in the logs, but which can be derived from the information in the logs. For instnace, it is useful to see events in categories other than those which naturally fall out of the data. Natural categories for web logs include page directories (the page field), months (the date/time field), or visitor domains (the hostname field). Similarly, it is useful to derive related values from the log fields values, and report them as though they were in the log data; for instance, if you have a username, you may want to report the full name, organization, and other information about the username. $PRODUCT_NAME treats every value of every field as a category, so you can categorize by any field in your log data. You can take advantage of this feature to make your own categories, even if those categories are not immediately clear in the log data. Categories like these are called \"custom fields.\". One common use of custom fields is to separate internal hits (hits from you) from external hits (hits from other people). Another use is to separate monitoring hits (hits from programs you use to monitor your own site) from actual hits (hits by browsing people). Another similar categorization is spider hits (hits from search engine robots and other robots) vs. human hits (hits by browsing people). Custom fields are also used to show metadata associated with a particular item, for instance to show whois information from an IP address, full name from a username, and other information. $PRODUCT_NAME does some common custom fields for you (geographic location derived from IP, hostname derived from IP, web browser derived from user-agent, and many more), but if you need to derive your own custom field, $PRODUCT_NAME also provides you with the \"hooks\" you need to do it.
There are five steps to this (described in detail below):
Step 1: Create a log field
Step 2: Create a database field based on that log field
Step 3: Create a report based on that database field
Step 4: Create a report menu item for that report
Step 5: Create a log filter to populate the log field
Here are the details:
Edit the profile .cfg file, in the profiles $lang_stats.directory of the LogAnalysisInfo $lang_stats.directory, using a text editor. Search for \"log = {\" and then search from there for \"fields = {\", to find the log fields list. Create a new field as shown below; enter the \"internal\" field name before the = sign (use only lower case letters, numbers, and underbars in the internal name), and enter the display \"label\" in the \"label =\" line. For instance, if you name the field category, the name and label will be the same; if you name it \"my category\", the name will be my_category and the label will be \"my category\". For this example, we will use \"my category\" as the field label throughout, and my_category as the field name.
my_category = { label = \"my category\" type = \"flat\" index = \"0\" subindex = \"0\" } # my_category
Still editing the profile .cfg from above, search for \"database = {\" and then search from there for \"fields = {\", to find the database fields list. Add a field like this:
my_category = { label = \"my category\" log_field = \"my_category\" type = \"string\" suppress_top = \"0\" suppress_bottom = \"2\" } # my_category
Still editing the profile .cfg from above, search for \"statistics = {\" and then search from there for \"reports = {\", to find the database fields list. Find an existing table report; the file_type report may be a good choice; otherwise pick any report with 'type = \"table\"'. Copy this entire report, paste to duplicate it. Now edit the report to customize it for the new field. The edited version is shown below, with modifications in bold. The modifications are: 1) the report name and report element name have been changed, 2) the database_field_name has been changed so the table is generated from the my_category field, 3) the labels on the report element and table column have been changed to \"My Category\", 4) the field_name for first table column has been changed to my_category so the first column displays the my_category field values. The comments (#) have also been changed, though this is not essential.
my_category = {
  report_elements = {
    my_category = {
      label = \"My Category\"
      type = \"table\"
      database_field_name = \"my_category\"
      sort_by = \"hits\"
      sort_direction = \"descending\"
      show_omitted_items_row = \"true\"
      omit_parenthesized_items = \"true\"
      show_totals_row = \"true\"
      starting_row = \"1\"
      ending_row = \"10\"
      only_bottom_level_items = \"false\"
      show_graph = \"false\"
      columns = {
        0 = {
          type = \"string\"
          visible = \"true\"
          field_name = \"my_category\"
          data_type = \"string\"
          header_label = \"My Category\"
          display_format_type = \"string\"
          main_column = \"true\"
        } # 0
        1 = {
          header_label = \"%7B=capitalize(database.fields.hits.label)=}\"
          type = \"number\"
          show_number_column = \"true\"
          show_percent_column = \"true\"
          show_bar_column = \"true\"
          visible = \"true\"
          field_name = \"hits\"
          data_type = \"int\"
          display_format_type = \"integer\"
        } # 1
        2 = {
          header_label = \"%7B=capitalize(database.fields.page_views.label)=}\"
          type = \"number\"
          show_number_column = \"true\"
          show_percent_column = \"false\"
          show_bar_column = \"false\"
          visible = \"true\"
          field_name = \"page_views\"
          data_type = \"int\"
          display_format_type = \"integer\"
        } # 2
        3 = {
          header_label = \"%7B=capitalize(database.fields.visitors.label)=}\"
          type = \"number\"
          show_number_column = \"true\"
          show_percent_column = \"false\"
          show_bar_column = \"false\"
          visible = \"true\"
          field_name = \"visitors\"
          data_type = \"unique\"
          display_format_type = \"integer\"
        } # 3
        4 = {
          header_label = \"%7B=capitalize(database.fields.size.label)=}\"
          type = \"number\"
          show_number_column = \"true\"
          show_percent_column = \"false\"
          show_bar_column = \"false\"
          visible = \"true\"
          field_name = \"size\"
          data_type = \"float\"
          display_format_type = \"bandwidth\"
        } # 4
      } # columns
    } # my_category
  } # report_elements
  label = \"My Category\"
} # my_category
Still editing the profile .cfg from above, search for \"reports_menu = {\" to find the reports menu. This node describes the layout of the menu at the left of the reports. It includes hierarchical groups and report nodes within each group. Find a report menu item in there with 'type = \"view\"' (which means it clicks to a view on a report); duplicate that item and edit it so it looks like the node below. Again, the changes are to change the name of the node, the label, the view_name (which specifies which report it should click through to view), and optionally the comment:
my_category = { type = \"view\" label = \"My Category\" view_name = \"my_category\" visible = \"true\" visible_if_files = \"true\" } # my_category
If you want the report to be in a different group from the one it's in, you can move it inside the \"items =\" list of any other group, or directly into the reports_menu node to make it a top-level report (not in any group).
This step varies greatly depending on what you're doing. In broad terms, what you need to do here is to create a log filter (in the Log Filters editor of the Config section of the web interface, or you can also do it in the log.filters section of the profile .cfg, by searching for \"log = {\" and then \"filters = \"). The log filter you create should set the value of your new field. It could be something as simple as this:
my_category = \"some value\"
to set the my_category field to the same constant value for every line, but that's not very useful. A slightly more useful example is to set it to part of another field, e.g.
my_category = substring(file_type, 1)
In this example, my_category is set to the same value as file_type, but without the first character. Much more complex manipulations are possible; you can use any expression here. You could set it like this:
my_category = agent . c_ip
to set my_category to the concatenation of the agent field and the c_ip field (which makes a pretty good \"unique visitor\" identifier for web logs).
Here's one real-world example of the way you might create a lookup map to set the my_category field from the username field in web logs. Start by creating a file my_category_map.cfg in the LogAnalysisInfo $lang_stats.directory, using a text editor. In that file, create a my_category value for each possible username, like this:
my_category_map = { jack = \"Sales\" jill = \"Sales\" bob = \"Marketing\" sue = \"Marketing\" sara = \"Legal\" ken = \"Engineering\" }
Then you can use this log filter:
if (subnode_exists(\"my_category_map\", username)) then my_category = node_value(subnode_by_name(\"my_category_map\", username)) else my_category = \"Unknown Category\"
This works because when you create a file my_category_map.cfg in LogAnalysisInfo, you're automatically creating a variable that $PRODUCT_NAME can access as \"my_category_map\" (as an aside, you can also use directories; e.g. if you create a file \"LogAnalysisInfo/log_filter_maps/my_category_map.cfg\" you can access it from log filters as log_filter_maps.my_category_map). The function subnode_exists() checks if there is a subnode of its first parameter node whose name matches the second parameter, so it will be true if the username exists in my_category_map. If it does exist, then it gets that subnode's value (e.g. \"Sales\") and puts it in the my_category field; otherwise, it sets it to \"Unknown Category\".
This is a fairly simple example; almost infinite flexibility is possible -- see ({=docs_chapter_link('salang')=}).
" } # custom_fields addingsearchengines = { label = "Adding Search Engines" question = "Can I configure $PRODUCT_NAME to recognize search engines other than the ones it knows already?" short_answer = "Yes -- just edit the search_engines.cfg file in the LogAnalysisInfo $lang_stats.directory with a text editor." long_answer = "Yes; $PRODUCT_NAME's search engine recognition mechanism is easily extensible. All the search engines $PRODUCT_NAME knows are described in a text file called search_engines.cfg, which is found in the LogAnalysisInfo $lang_stats.directory of your $PRODUCT_NAME installation. $PRODUCT_NAME puts several dozen search engines in there to begin with (the big, well-known ones), but you can add as many more as you like, by editing the file with a text editor. Just add a new line for each new search engine, and the next time $PRODUCT_NAME processes log data, it will recognize those search engines, and it will include them in the database.
The \"name\" value for a search engine name of the search engine; put whatever you want the search engine to be called there. That's what will appear in the statistics. The \"substring\" value is a \"quick check\" that $PRODUCT_NAME uses to check if a URL might be a URL from that search engine. If the URL contains the \"quick check\" string, $PRODUCT_NAME then does a slower check using the \"regexp\" column, which is a regular expression. If the regular expression matches, $PRODUCT_NAME uses the parenthesized section of the regular expression as the search terms (it should be a series of search terms, separated by plusses (+)). The parenthesized section is used to compute the search terms and search phrases statistics.
You might notice that the \"substring\" column is redundant -- $PRODUCT_NAME doesn't really need it at all, since it could just check every URL with the regular expression. The reason that second column is there is that regular expressions are relatively slow -- $PRODUCT_NAME can process log data much faster if it doesn't have to check every URL in the log data against dozens of regular expressions. This way, it only has to use the regular expressions on a tiny proportion of the URLs that it sees.
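As a purely hypothetical illustration (the exact file syntax varies between $PRODUCT_NAME versions, so copy an existing entry in your own search_engines.cfg and follow its layout), an entry conceptually supplies three values along these lines:
name: ExampleSearch
substring: search.example.com
regexp: search\\.example\\.com/.*[?&]q=([^&]+)
Here the parenthesized group in the regular expression is the part $PRODUCT_NAME extracts as the plus-separated search terms.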
" } recentstats = { label = "Recent Statistics" question = "How can I see just the most recent day/week/month of statistics?" short_answer = "Use the Calendar, or the Filters, or use arecentdays
filter on the command line."
long_answer = "In the reports, you can go to the Calendar view and click on a recent day, week, or month to see the statistics for that time period. You can also edit the global filters to zoom in on any collection of months or days, including the most recent ones.
However, filters made in that manner will not move forward as the date changes.
If you want a statistics filter that will always show the most recent seven days, automatically,
then you will need to use the command line, or edit the profile file manually.
$PRODUCT_NAME's command-line filtering options are slightly more powerful than the filtering
options available from the web interface. Though it's not possible in the web interface to
create a filter which always shows the last seven days,
it is possible to do this
from the command line, using a recentdays:N
filter on the date/time field.
For instance, to send email showing the past seven days of data, use a command line like this:
$PRODUCT_EXECUTABLE -rfcf $config -cm svbe -f \"recentdays:7\"
It is also possible to use this kind of filter in a profile file, by editing the file manually. For instance, if you want a recentdays filter on a particular report or report element, so that it always shows the most recent seven days of data, you can edit the profile file (in the profiles $lang_stats.directory of LogAnalysisInfo) and change the \"filters\" value within the report or report_element node to recentdays:7 (create a node called \"filters\" if one does not already exist).
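As a sketch, the relevant part of the edited profile might end up looking something like this; my_report here is a placeholder for whichever report you are editing, and its other existing options are omitted:
my_report = {
  report_elements = {
    my_report = {
      filters = \"recentdays:7\"
    } # my_report
  } # report_elements
} # my_report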
" } changingtimezone = { label = "Changing the Time Zone in Statistics" question = "My server logs times in GMT, but I'm in a different time zone. How can I get the statistics in my own time zone?" short_answer = "Set the date_offset option in the profile." long_answer = "$PRODUCT_NAME reports times exactly as they appear in the log data -- if the time shows up as 8:00 AM in the log data, that hit will appear as 8:00 AM in the statistics. Since servers sometimes log in GMT, or some other time zone from where $PRODUCT_NAME is running, you may want to offset the times in your statistics to match your own time zone, rather than the server's time zone or GMT. This is easily done using the date_offset option in the profile file (the profile file is in the profiles folder of LogAnalysisInfo). The number of hours specified in that option is added to the date/time, so if it's a negative number, it moves times backwards, and if it's positive, it moves them forwards. For instance, if you're 8 hours behind GMT (GMT-0800), and your server logs in GMT, you can set this value to -8 to get statistics in your own time zone. This option affects log entries are they are processed, so you'll need to rebuild the database after setting this option, to see the changes in the statistics. " } exporttable = { label = "Exporting Data From Statistics" question = "Can I export the data from $PRODUCT_NAME reports to Excel or other programs?" short_answer = "Yes; click the \"export\" link in the toolbar above reports to export the data from that report's table in CSV format. Many programs, including Excel, can import CSV format files." long_answer = "$PRODUCT_NAME supports CSV export of any table. Just view the statistics, find the table you want, and click the \"export\" link in the toolbar. Save the resulting file from your browser, and import it into Excel or any other program that supports CSV.
You can also generate CSV from the command line, like this:
$PRODUCT_EXECUTABLE -p profilename -a ect -rn \"view-name\"
for instance,
$PRODUCT_EXECUTABLE -p MyConfig -a ect -asv \"Pages\"
You can also use the -f option ({=docs_chapter_link('filters')=}) on the command line to use filters on the table data.
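For instance, to export the Pages table restricted to the most recent seven days (combining the options above with the recentdays filter described in {=docs_faq_link('recentstats') =}), a command along these lines should work; the profile name is an example:
$PRODUCT_EXECUTABLE -p MyConfig -a ect -rn \"Pages\" -f \"recentdays:7\"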
" } hitfiles = { label = "Showing All Files" question = "How can I see all files that were hit on my web site, not just the pages?" short_answer = "Delete or disable the 'Strip non-page-views' log filter, and rebuild the database" long_answer = "By default, $PRODUCT_NAME does not track the hits on individual image files and other non-page files when analyzing web log data, to save space in the database and reduce clutter in the \"Pages\" report. It does this by replacing the filename portion of the page with the value '(nonpage)', so all non-page hits will appear as values ending with '(nonpage)'. If you need this information, you need to tell $PRODUCT_NAME to track filenames for all hits. To do this, go to the Log Filters section of the Config section of your profile, and delete or disable the log filter called 'Strip non-page-views', which replaces the filename for non-page-view hits with '(nonpage)'. Then rebuild the database and view the reports, and all files (not just pages) will appear in the \"Pages\" and \"Pages/directories\" reports." } accuracy = { label = "Are the Statistics Accurate?" question = "I've heard that statistics like visitors, \"sessions,\" and \"paths through the site\" can't be computed accurately. Is that true? Are the statistics reported by $PRODUCT_NAME an accurate description of the actual traffic on my site?" short_answer = "$PRODUCT_NAME accurately reports the data as it appears in the log file. However, many factors skew the data in the log file. The statistics are still useful, and the skew can be minimized through server configuration." long_answer = "
$PRODUCT_NAME (and all other log analysis tools) reports statistics based on the contents of the log files. With many types of servers, the log files accurately describe the traffic on the server (i.e. each file or page viewed by a visitor is shown in the log data), but web log files are trickier, due to the effects of caches, proxies, and dynamic IP addresses.
Caches are locations outside of the web server where previously-viewed pages or files are stored, to be accessed quickly in the future. Most web browsers have caches, so if you view a page and then return in the future, your browser will display the page without contacting the web server, so you'll see the page but the server will not log your access. Other types of caches save data for entire organizations or networks. These caches make it difficult to track traffic, because many views of pages are not logged and cannot be reported by log analysis tools.
Caches interfere with all statistics, so unless you've defeated the cache in some way (see below), your web server statistics will not represent the actual viewings of the site. The logs are, however, the best information available in this case, and the statistics are far from useless. Caching means that none of the numbers you see are accurate representations of the number of pages actually viewed, bytes transferred, etc. However, you can be reasonably sure that if your traffic doubles, your web stats will double too. Put another way, web log analysis is a very good way of determining the relative performance of your web site, both compared to other web sites and compared to itself over time. This is usually the most important thing, anyway-- since nobody can really measure true \"hits,\" when you're comparing your hits to someone else's hits, both are affected by the caching issues, so in general you can compare them successfully.
If you really need completely accurate statistics, there are ways of defeating caches. There are headers you can send which tell the cache not to cache your pages; these usually work, but are ignored by some caches. A better solution is to add a random tag to every page, so instead of loading /index.html, they load /index.html?XASFKHAFIAJHDFS. That will prevent the page from getting cached anywhere down the line, which will give you completely accurate page counts (and paths through the site). For instance, if someone goes back to a page earlier in their path, it will have a different tag the second time, and will be reloaded from the server, relogged, and your path statistics will be accurate. However, by disabling caching, you're also defeating the point of caching, which is performance optimization-- so your web site will be slower if you do this. Many choose to do it anyway, at least for brief intervals, in order to get \"true\" statistics.
The other half of the problem is dynamic IP addresses, and proxies. This affects the \"visitor\" counts, in those cases where visitors are computed based on the unique hosts. Normally, $PRODUCT_NAME assumes that each unique originating hostname or IP is a unique visitor, but this is not generally true. A single visitor can show up as multiple IP addresses if they are routed through several proxy servers, or if they disconnect and dial back in, and are assigned a new IP address. Multiple visitors can also show up as a single IP address if they all use the same proxy server. Because of these factors, the visitor numbers (and the session numbers, which depend on them) are not particularly accurate unless visitor cookies are used (see below). Again, however, it's a reasonable number to throw around as the \"best available approximate\" of the visitors, and these numbers tend to go up when your traffic goes up, so they can be used as effective comparative numbers.
As with caching, the unique hosts issue can be solved through web server and profile configuration. Many people use visitor cookies (a browser cookie assigned to each unique visitor, and unique to them forever) to track visitors and sessions accurately. $PRODUCT_NAME can be configured to use these visitor cookies as the visitor ID, by extracting the cookie using a log filter, and putting it in the \"visitor id\" field. This isn't as foolproof as the cache-fooling method above, because some people have cookies disabled, but most have them enabled, so visitor cookies usually provide a very good approximation of the true visitors. If you get really tricky you can configure $PRODUCT_NAME and/or your server to use the cookie when it's available, and the IP address when it's not (or even the true originating IP address, if the proxy passes it). Better yet, you can use the concatenation of the IP address and the user-agent field to get even closer to a unique visitor id even in cases where cookies are not available. So you can get pretty close to accurate visitor information if you really want to.
To summarize, with a default setup (caching allowed, no visitor cookies), $PRODUCT_NAME will report hits and page views based on the log data, which will not precisely represent the actual traffic to the site; neither will any other log analysis tool. $PRODUCT_NAME goes further into the speculative realm than some tools by reporting visitors, sessions, and paths through the site. With some effort, your server can be configured to make these numbers fairly accurate. Even if you don't, however, you can still use these as valuable comparative statistics, to compare the growth of your site over time, or to compare one of your sites to another.
" } cgiuserpermissions = { label = "CGI User Permissions" question = "When I run $PRODUCT_NAME as a CGI, it runs as a special user (nobody, web, apache, etc.). Then when I want to use $PRODUCT_NAME from the command line or in web server mode, the permissions don't allow it. What can I do about this?" short_answer = "Loosen the permissions in the Preferences, or run your CGI programs as a different user, or run your command line programs as the CGI user." long_answer = "For security reasons, UNIX web servers often run CGI programs as a special user, often user nobody, or user web, or user cgi, or user apache. When you run $PRODUCT_NAME in CGI mode, it runs as this user, and any files it creates are owned by that user. This can cause problems if you later need to run $PRODUCT_NAME as a different user, for instance to run a command-line database update-- the files which were created as the CGI user will not be accessible to the non-CGI user, and you will get errors about $PRODUCT_NAME not being able to read or write certain files.
There are several possible solutions to this problem:
You can run your command lines as the CGI user. This is often the easiest solution. If your CGI user is user nobody, then use \"su nobody\" to change to user nobody, and then run your commands as that user. Since both the CGI version and the command-line version will be running as the same user, there will be no permissions issues. You may need to configure a password, shell, and home directory for user nobody before you can log in as that user, which will require root access. This option is slightly insecure because giving user \"nobody\" a home directory and a shell makes it a slightly more powerful user; if the purpose of using \"nobody\" as the CGI user was to run CGI programs with a powerless user, this circumvents that security somewhat.
You can run your CGI program as the command-line user. If your username is \"myself\", then you can reconfigure your web server to run CGI programs as that user, rather than the user it's using now. You may even be able to configure the server to run only $PRODUCT_NAME as that user, while continuing to run other programs with the usual CGI user. Because both the CGI version of $PRODUCT_NAME and the command line version will be running as user \"myself\", there will be no permissions issues. This may be difficult to configure, however; see your web server documentation for instructions on how to configure your server to run CGI programs as a different user. On some servers, this may not be possible.
You can change the permissions of the files that $PRODUCT_NAME creates, by editing the permissions options in the Preferences. This is usually an insecure solution, however, since you'll need to loosen many of the permissions to 777 (everyone can read, write, execute/search), which makes your files vulnerable to modification by unauthorized users on the machine. This option may be acceptable, however, if access to the machine is limited to authorized users; i.e. if the only ones who can log in by telnet, SSH, FTP, etc. are those who are trusted $PRODUCT_NAME administrators.
Any one of these solutions will work; you do not need to do more than one of these.
" } frequentnewversions = { label = "Frequent New Versions of $PRODUCT_NAME" question = "Why are new versions of $PRODUCT_NAME released so often? Is it buggy? Do I need to download every new version?" short_answer = "We ship new versions to provide our customers with the latest minor features and bug fixes quickly. $PRODUCT_NAME is no buggier than any other software, and you don't need to download a new release unless you're having problems with the current one." long_answer = "We've had a few people ask us why we ship new versions of $PRODUCT_NAME so often. The reason is that we want to provide our customers with access to the latest minor features (e.g. new log formats) and bug fixes. Our shipping process is highly automated, so it is relatively easy for us to ship a new version, so we do it frequently.
There are bugs in $PRODUCT_NAME, just like there are bugs in all computer programs. Of course, we strive to keep the bugs to a minimum, but $PRODUCT_NAME is very complex software, and we get reports of a few new bugs every week. We roll these into new releases every couple weeks, and ship them so that new downloaders won't be troubled by these bugs, and people who are experiencing them will be able to get a fixed version. Other computer programs have similar numbers of bugs, but they package more bug fixes in each release, and release versions less frequently.
Unless you're having problems with the version of $PRODUCT_NAME you're currently running, or you need a new feature we've added (like support for a new log format), there is no need to upgrade. You can upgrade at whatever pace you like, and skip any upgrades in the middle; each new release of $PRODUCT_NAME is a full release, so you don't need any previous version installed to use it.
" } relabeling = { label = "Relabeling/White-labeling $PRODUCT_NAME" question = "I want to deploy $PRODUCT_NAME to my customers, but I want it to look like part of my site. I don't want the name $PRODUCT_NAME to appear -- I want my own name to appear. Can I relabel or white-label $PRODUCT_NAME?" short_answer = "Yes, but the degree to which you can relabel depends on your license." long_answer = "You can relabel $PRODUCT_NAME and it's not very difficult, however the extent to which you can relabel depends on the license purchased (i.e. Professional, Enterprise etc.).
$PRODUCT_NAME Professional allows easy modification of certain screen attributes within the standard End User License, such as colors, fonts, etc. $PRODUCT_NAME Professional also allows the on-screen language to be modified or translated, and allows a graphic item to be added through the custom HTML headers and footers; however, the license does not allow the removal or replacement of any $PRODUCT_NAME logos, credits, etc. If you require Professional Edition with even more customization ability and you are a qualifying user or integrator, we may be able to assist you. In that case, forward a detailed proposal to us, containing precise descriptions (preferably diagrams) of how you would like the screens to look, and we will respond.
$PRODUCT_NAME Enterprise allows very considerable customization of the user interface and statistics screens, to the point that just about every on-screen item can be modified, deleted, or replaced, or new items introduced. This is permitted within the standard license, which should be consulted prior to making any final changes.
You can view the $PRODUCT_NAME End User License here. Note that under all circumstances, and for each product, the License requires that you leave the Flowerfire copyright notice untouched and visible together with a visible reference to $PRODUCT_NAME on every page.
Please contact support@flowerfire.com if our standard licensing does not meet your need.
" } serverdown = { label = "$PRODUCT_NAME Server is Down" question = "I can't access $PRODUCT_NAME where I usually do (http://www.xxx.yyy.zzz:8988/) -- is your (Flowerfire's) server down?" short_answer = "No -- your server is down. $PRODUCT_NAME runs on your computer, not on ours -- contact your network administrator if you're having problems accessing it." long_answer = "$PRODUCT_NAME runs as a web server on the computer where it was installed, which is a client computer, not one of our servers. So if you're having trouble accessing $PRODUCT_NAME through your web browser, it means that your installation of $PRODUCT_NAME is messed up in some way ($PRODUCT_NAME may not be running where you expected it to be). If you installed $PRODUCT_NAME yourself, you may need to restart it. If someone else installed $PRODUCT_NAME, please contact them (it may be your network administrator) for assistance in getting $PRODUCT_NAME up and running again.
On a related note, $PRODUCT_NAME never contacts Flowerfire, or any of Flowerfire's computers. It does not transmit log data to Flowerfire, it does not transmit statistics to Flowerfire, it does not receive any information or data from Flowerfire (the sole exception being the download of the GeoIP database, if it isn't present in the installation), and in all other ways it is a complete self-contained program that does not rely on Flowerfire's servers. Because $PRODUCT_NAME runs as a web server, people often assume that $PRODUCT_NAME is actually running on the Internet, on one of our servers, but it isn't -- it runs on your computers, and does not use the Internet or the network except where you specifically ask for it (i.e. to download files by FTP when you've requested that it do so, or to send mail when you've asked it to, or to look up IP numbers using DNS when you've asked it to).
" } missingdays = { label = "Days Are Missing from the Log Data" question = "When I look at my statistics, I see that some days are missing. I know I had traffic on those days. Why aren't they shown?" short_answer = "Your ISP may be regularly deleting or rotating your log data. Ask them to leave all your log data, or rotate it over a longer interval. It's also possible that your log data does not contain those days for another reason." long_answer = "To save disk space, many ISPs delete, or \"rotate\" (rename and/or compress) the server log data regularly. For instance, instead of letting the log file grow forever, they may rename it every day, start a new one, and compress the old one; then, every week, they may delete the logs older than seven days. In other, more dramatic cases, they may simply delete the log file every month or week, and restart a new one.
Though this does save disk space on the server, it presents serious problems for log analysis. When you rebuild the database with $PRODUCT_NAME, it processes all the existing log data, and creates a new database from it. If some of the old log data has been deleted, that data will no longer be available in the statistics. So if the ISP deletes the logs every month, and you rebuild your database, your statistics will go back one month at the most.
Similarly, when you update the database, $PRODUCT_NAME adds any new data in the existing log data to the database. So if the ISP deletes log files every month, and you only update your database every month on the 15th, then all the data from the 15th to the end of each month will be missing, because it was not added through an update, and it was deleted on the 1st of the month.
The best solution is to convince your ISP to keep all of your log data, and never delete any of it. If you can do that, then there will be no problem-- you'll always be able to rebuild or update your database and get all of the statistics. Since this will require more of your ISP's disk space, however, they may not be willing to do this, especially if you have a very large site, or they may charge extra for the service. Of course, if you own and manage your own server, you can do this yourself.
The second best solution, if you can't convince the ISP to keep all log data, is to store your back log files on your own system. If your ISP rotates the data through several logs before deleting the oldest one, this is easy-- just download the logs you don't already have, on a regular schedule (you may be able to automate this using an FTP client). If they only keep one copy, and delete it and restart it regularly, then you'll need to download that file as close to the reset time as possible, to get as much data as possible before it is deleted. This is not a reasonable way for ISPs to rotate logs, and you should try to convince them to rotate through several files before deleting the oldest one, but some of them do it this way anyway. You'll never get all of your log data if they use this technique-- the very last entries before deletion will always be lost-- but if you time it right you can get pretty close.
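If your ISP makes the rotated logs available by FTP, a small script can handle the regular download. Here is a minimal sketch using Python's standard ftplib; the host name, credentials, and directory paths are placeholders, and your ISP's setup may differ.
from ftplib import FTP
import os

local_dir = '/backlogs'                      # where you keep your back logs
ftp = FTP('ftp.example-isp.com')             # placeholder host
ftp.login('username', 'password')            # placeholder credentials
ftp.cwd('/logs')                             # placeholder remote log directory
for name in ftp.nlst():
    target = os.path.join(local_dir, name)
    if not os.path.exists(target):           # only fetch files we don't already have
        with open(target, 'wb') as f:
            ftp.retrbinary('RETR ' + name, f.write)
ftp.quit()
A script like this can be scheduled to run nightly, so the back logs accumulate on your system even if the ISP later deletes them from the server.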
Once you have the logs on your system, you can analyze them at your leisure, without worrying about them being deleted. In this situation, you'll probably want to run $PRODUCT_NAME on the system where you keep the back logs.
If your log rotation is not the issue, then it may be that your log data does not contain the data for another reason. Maybe the server was down for a period, or the log data was lost in a disk outage, or it was corrupted. Look at the log data yourself, using a text editor, to make sure that it really does contain the days that you expected it to contain. If the data isn't in your logs, $PRODUCT_NAME cannot report statistics on it.
" } v7plugin = { label = "Using version 7 plug-ins" question = "Will my plug-in work with version 8?" short_answer = "Most version 7 plug-ins will work with version 8." long_answer = "There is an issue with final_step being used in a plug-in. It is a broad problem that can arise when final_step is used in a plug-in, and relies on specific names or layout in the profile. Because final_step can contain arbitrary code, and can access or modify anything, it bypasses the structured layout of the rest of the profile, and is therefore potentially version specific, and cannot be automatically converted. While any standard v7 plug-in will work with v8, plug-ins that use final_step may not work, if they access structures which have been removed or renamed in version 8. This sort of plug-in will have to be manually modified to make it v8 compatible.
" } emailreportsoutlook = { label = "Emailed Reports in Outlook 2003" question = "Why do my emailed reports from Outlook 2003 not line up, everything is out of alignment?" short_answer = "Change the settings in Outlook to not load automatically." long_answer = "Follow these directions from the main menu in Outlook 2003.
Go to the Tools menu, select Options, and then Security. In the Download Settings section, click to change the automatic download settings, then uncheck the setting that says not to download pictures or content automatically in HTML e-mail. Click OK to close the download images dialog box and the Options dialog box, and then view your email message again. The report should look fine, with all of the headers and graphs lining up.
" } sessioncomputation = { label = "Session Computation" question = "How does $PRODUCT_NAME compute session information, like total sessions, repeat visitors, paths through the site, entry pages, exit pages, time spent per page, etc.?" short_answer = "$PRODUCT_NAME uses the visitor id field to identify unique visitors. It decides that a new session has begun if a visitor has been idle for 30 minutes." long_answer = "$PRODUCT_NAME computes session information by tracking the page, date/time, and visitor id (which is usually the originating hostname) for each page view in the log data. When a session view is requested, it processes all of these page views at the time of the request, ignoring those that are filtered out by filters on the page or date/time fields. All other hits are included-- filters on other fields are ignored in session information.
$PRODUCT_NAME groups the hits into initial sessions based on the visitor id-- it starts by assuming that each visitor contributed one session. It sorts the hits by date, so it has a click-by-click record of the movement of each visitor.
Then it splits the sessions, using the customizable session timeout interval (30 minutes by default). Since there is no real \"log out\" operation in HTTP, there is no way for $PRODUCT_NAME to know the real time that a user leaves the site; it can only guess by assuming that if they didn't click anything for 30 minutes, they must have left and come back. The split step, then, increases the number of sessions, resulting in possibly more than one session per visitor.
Next, $PRODUCT_NAME discards sessions over 2 hours long (this is configurable). The idea behind this is that most web sessions are considerably shorter than that, so there's a good chance that any really long session is actually caused by multiple visitors using the same proxy server to visit the site. That looks like one long session because all of the hits seem to come from the proxy server. $PRODUCT_NAME rejects these because there is no way to tell which hits were from a particular visitor. If you're using visitor cookies to track unique visitors, this will not be a problem, so you can set this option to a higher value to see all your sessions, even those over 2 hours.
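To make the split-and-discard steps concrete, here is a rough sketch in Python (this is just the idea, not $PRODUCT_NAME's actual implementation). It assumes hits is a non-empty list of (timestamp, page) pairs for one visitor id, sorted by time, with timestamps in seconds.
TIMEOUT = 30 * 60          # session timeout interval (30 minutes)
MAX_LENGTH = 2 * 60 * 60   # maximum session duration (2 hours)

def split_sessions(hits):
    sessions = [[hits[0]]]
    for prev, cur in zip(hits, hits[1:]):
        if cur[0] - prev[0] > TIMEOUT:   # idle too long: start a new session
            sessions.append([])
        sessions[-1].append(cur)
    # Discard sessions longer than the maximum; these are probably several
    # visitors behind one proxy, indistinguishable from each other.
    return [s for s in sessions if s[-1][0] - s[0][0] <= MAX_LENGTH]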
Finally, $PRODUCT_NAME discards sessions based on the Session Filters (which you can set in the Session Filters bar at the top of the statistics). The session filters can be set to discard all sessions except those from a particular visitor, or they can be set to discard all sessions except those which go through a particular page.
After that, $PRODUCT_NAME is ready to generate the statistics reports. The \"Sessions Overview\" report is generated by examining the sessions in various ways (for instance, the repeat visitors number is the number of visitors who have more than one session; i.e. those whose sessions were \"split\" by the timeout interval). The \"entry pages\" and \"exit pages\" reports are generated by tabulating the first and last pages of every session. The \"session pages\" report is generated by finding every occurrence of each page in any session, computing how long it was from then until the next page in that session (exit pages are considered to have zero time spent per page), and tabulating the results for all pages to compute time per page and other statistics. The \"paths (clickstreams)\" report shows all the sessions in a single expandable view.
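Continuing the sketch above, the tabulation of entry pages, exit pages, and time per page might look roughly like this (again, an illustration only), where sessions is the combined list of sessions for all visitors (for instance, the results of split_sessions for each visitor id), and each page view is a (timestamp, page) pair:
from collections import Counter, defaultdict

entry_pages, exit_pages = Counter(), Counter()
time_per_page = defaultdict(float)

for session in sessions:
    entry_pages[session[0][1]] += 1      # first page of the session
    exit_pages[session[-1][1]] += 1      # last page of the session
    for (t1, page), (t2, _) in zip(session, session[1:]):
        time_per_page[page] += t2 - t1   # exit pages contribute zero time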
" } visitors_vs_session_users = { label = "Visitors vs. Session Users" question = "Why doesn't the number of visitors in the Overview match the number of session users in the \"Sessions Overview\" report?" short_answer = "Session information only shows users contributing page views, and other views show all visitors. Also, long sessions are discarded from the session information." long_answer = "The configuration database is split into two major sections: the main statistics, and the session information. The main statistics contains information on all hits; the session information shows the \"sessions\" -- i.e. it tracks the sequence of page views of each person who visits the site. Most views show the main statistics; only the session-related views (Sessions (summary), Sessions, Paths (clickstreams), Paths through a page, Entry pages, Exit pages, Paths through a page, Session pages, etc.) show the session information. Because these two types of data are computed differently, the numbers may vary between the two.
There are two major factors that affect the session users, but do not affect the visitors. First, session information is based on page views only, while visitor information is computed based on all hits in the database. So for instance, if the web site is accessed by a browser that fetches only a single image file, and never hits a page, that hit (and that host) will appear in the main statistics, but not in the session statistics. To put it another way, the visitors are the number of unique hosts who contributed hits; the session users are the number of unique hosts contributing page views. If your database is set up to track hits or bandwidth, these numbers may be significantly different. If your database tracks only page views, then visitor information will also be based on page views, and visitors and session users will be closer. Second, very long sessions (over 2 hours, by default) are discarded from the session information, as described in the Session Computation entry above, so visitors whose only sessions were discarded will not appear as session users.
" } wrongyear = { label = "Years are wrong in the statistics" question = "The statistics show the wrong years -- when I analyze data from previous years, it appears as this year, or data from this year appears in last year. Why?" short_answer = "Your log format does not include year information, so $PRODUCT_NAME has to guess the year. Use a different log format if possible (one which includes year information). See the long answer for a way of manually setting the year for blocks of log data." long_answer = "Most log formats include the year as part of the date on every line, but a few (in particular, Unix Syslog format) include only month and day. In this situation, $PRODUCT_NAME has no way of knowing which year a particular event occurred in, so it has to guess. Recent versions of $PRODUCT_NAME will always guess that the event occurred in the current year; previous versions may have a particular year hard-coded in the default_log_date_year option in the profile, and will put all events in that year.
The best solution, if possible, is to use a different log format--one which includes year information. Then $PRODUCT_NAME will always categorize events in the correct year.
If that's not an option, then you will need to help $PRODUCT_NAME to know which data belongs in which year. There are several options, but the easiest one, if you are using Unix Syslog format, is to rename your log files so they end in yyyy.log, where yyyy is the year the log data is from. If some logs span multiple years, you will need to split those logs into files which do not cross year boundaries. For instance, if you have mail.log which contains data from 2004, 2005, and 2006, you can split it into three files, mail_2004.log, mail_2005.log, and mail_2006.log. The Unix Syslog plug-in automatically recognizes filenames which end with yyyy.log, and uses that value as the year when no year is available in the log data.
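As a rough illustration, a multi-year syslog file could be split into per-year files with a small Python script like the one below. It assumes the log lines are in chronological order, the starting year is known, and the file names shown (mail.log, mail_2004.log, etc.) match yours; since syslog lines have no year, the script bumps the year whenever the month rolls over from December back to January.
MONTHS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

year = 2004        # the year the log data starts in (adjust to your data)
last_month = 1
outputs = {}
with open('mail.log') as infile:
    for line in infile:
        month = MONTHS.get(line[:3], last_month)
        if month < last_month:   # e.g. Dec -> Jan: a new year has started
            year += 1
        last_month = month
        if year not in outputs:
            outputs[year] = open('mail_%d.log' % year, 'w')
        outputs[year].write(line)
for f in outputs.values():
    f.close()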
Another option, also for logs written by Unix Syslog, is available if the message part of each log line contains a full date, including year. For instance, some logging devices include \"date=2006-02-01\" in the log data, indicating the date of the event. In this case, even though the syslog format may not have the year, the device plug-in can extract the year from the message. This is usually a simple modification of the plug-in, but not all plug-ins have been modified to support this yet. If your log data contains year information in the message, but the reports show the data in the wrong year, please contact $SUPPORT_EMAIL and we will add extraction of years from the message for your format (include a small sample of log data, as a compressed attachment).
Another option is to put the data in $lang_stats.directories by year; e.g. put all your 2005 data in a $lang_stats.directory called /logs/2005, and all your 2006 log data in a $lang_stats.directory called /logs/2006, and then process the data in stages using the following command lines:
$PRODUCT_EXECUTABLE_DOCS -p profilename -a bd log.source.0.pathname /logs/2005 log.processing.default_log_date_year 2005
$PRODUCT_EXECUTABLE_DOCS -p profilename -a ud log.source.0.pathname /logs/2006 log.processing.default_log_date_year 2006
The first command creates a database using all the data from 2005, using 2005 as the year. The second command processes all the data from 2006, adding it to the existing database, using 2006 as the year. The final result is a database which has 2005 data in 2005 and 2006 data in 2006. From then on, you can update your database normally, and the new log data (from the most recent day) will be correctly categorized in the current year.
If new data continues to be added in the wrong year, make sure that the default_log_date_year option is set to thisyear in your profile .cfg file (in LogAnalysisInfo/profiles), and in LogAnalysisInfo/default_profile.cfg.
"
}
sawmillname = {
label = "The Name \"$PRODUCT_NAME\""
question = "Where did the name \"$PRODUCT_NAME\" come from?"
short_answer = "A sawmill is a tool that processes logs, and so is $(PRODUCT_NAME)."
long_answer = "
A sawmill is a tool that processes logs (the kind made from trees), and so is $PRODUCT_NAME (it processes web server logs).
" } sgsexport = { label = "Exporting Symantec SGS/SEF data to text format" question = "$PRODUCT_NAME does not recognize my Symantex SGS/SEF log data, because it is binary. How can I export this data to a text format so $PRODUCT_NAME can process it?" short_answer = "Use flatten8, or remorelog8" long_answer = `The Symantec Security Gateways plug-in is based on a text export of a binary data file on the SGS/SEF device.
To use "remotelogfile8.exe" to extract the text log from the binary data:
1. Browse to "http://www.symantec.com/search/"
2. Search for document "2004021815290054"
To use the "flatten8" utility to extract the text log from the binary data:
1. Review page 102 of "Symantec™ Security Gateways - Reference Guide", Version 8; the following is an excerpt:
Flatten utility
The flatten8 utility is shipped on the included CD and lets you perform simple log file management from the command-line. The flatten8 utility reads in the log message information from the system’s XML files, and then parses in real-time the binary log file, substituting the actual error text message for its binary counterpart.
Most often, this utility is used to convert the binary log file to a more usable format for a third party utility, such as an ASCII text editor. This utility is also used to review the most recent messages, or directed to show just statistics messages.