StructureCMS

February 25, 2010

Rediscovering MySQL

Filed under: ColdFusion, Database — joel.cass @ 9:14 am

Recently I have had to create a reporting system for some server log files. It was sort of an ad-hoc thing; it was really done in a rush because we couldn’t get AWStats to process the files properly and the customer had a really steep deadline to meet.

Anyway, I started off by creating a system that can read in log files following a regular expression. It would then add the contents of these logs to the database for reporting purposes. The code I used is below:

<!--- Log importer setup: reads web-server access logs from `logpath` whose
      names match `filter`, parses each line with `regex` (looks like
      NCSA/common log format -- NOTE(review): confirm against the actual
      server's log configuration) and inserts the fields into the reporting
      table. requesttimeout is 10 days so huge files can finish. --->
<cfsetting enablecfoutputonly="true" requesttimeout="864000" />

<!--- Capture group 1 of `filter` is the numeric file id; it is used below
      as a prefix when building unique row ids. --->
<cfparam name="logpath" default="#expandPath('logs/')#" />
<cfparam name="filter" default="^nc([0-9]*)\.log$" />

<cfdirectory action="list" directory="#logpath#" name="files" sort="name DESC" />

<!--- Log-line pattern. Capture groups: 1=client ip, 2/3=unknown fields,
      4=[bracketed datetime], 5=method, 6=url, 7=protocol (unused),
      8=status, 9=size. The doubled "" is an escaped quote character
      inside a CFML string literal. --->
<cfset regex = "^([^ ]*) ([^ ]*) ([^ ]*) (\[[^\]]*\]) ""([^ ]*) (.*) ([^ ]*)"" ([^ ]*) ([^ ]*)$" />
<!--- Splits "[dd/Mon/yyyy:hh:mm:ss zone]" into day(1)/month(2)/year(3)
      and hour(4)/minute(5)/second(6); the timezone offset is discarded. --->
<cfset date_regex = "^\[([0-9]*)/([A-Za-z]*)/([0-9]*)\:([0-9]*)\:([0-9]*)\:([0-9]*).*\]$" />

<!--- English month abbreviations (as written by the web server) mapped to
      two-digit month numbers for CreateDateTime(). --->
<cfset month_map = structNew() />
<cfset month_map["Jan"] = "01" />
<cfset month_map["Feb"] = "02" />
<cfset month_map["Mar"] = "03" />
<cfset month_map["Apr"] = "04" />
<cfset month_map["May"] = "05" />
<cfset month_map["Jun"] = "06" />
<cfset month_map["Jul"] = "07" />
<cfset month_map["Aug"] = "08" />
<cfset month_map["Sep"] = "09" />
<cfset month_map["Oct"] = "10" />
<cfset month_map["Nov"] = "11" />
<cfset month_map["Dec"] = "12" />

<!--- Passing ?resetDB on the URL drops and rebuilds the reporting table
      plus its indexes (SQL Server T-SQL). The table/index names are
      interpolated from application scope, not user input.
      NOTE(review): DROP TABLE errors if the table does not exist yet, and
      some drivers reject multi-statement query strings -- confirm both
      against the target environment before relying on this reset path. --->
<cfif structKeyExists(URL, "resetDB")>
  <cfquery datasource="#application.dsn#">
  DROP TABLE #application.tablename#;
  CREATE TABLE #application.tablename#(
    [id] [bigint] NOT NULL,
    [ip] [nvarchar](50) NULL,
    [datetime] [datetime] NULL,
    [url] [ntext] NULL,
    [url_hash] [nvarchar](50) NULL,
    [method] [nvarchar](50) NULL,
    [status] [nvarchar](50) NULL,
    [size] [nvarchar](50) NULL,
    [unknown1] [nvarchar](50) NULL,
    [unknown2] [nvarchar](50) NULL,
  CONSTRAINT [PK_#application.tablename#] PRIMARY KEY CLUSTERED (
    [id] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    IGNORE_DUP_KEY = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
  ) ON [PRIMARY]
  ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];

  CREATE NONCLUSTERED INDEX [idx_#application.tablename#_date] ON [dbo].[#application.tablename#] (
    [datetime] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    IGNORE_DUP_KEY = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
  ) ON [PRIMARY];

  CREATE NONCLUSTERED INDEX [idx_#application.tablename#_ip] ON [dbo].[#application.tablename#] (
    [ip] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    IGNORE_DUP_KEY = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS  = ON
  ) ON [PRIMARY];

  CREATE NONCLUSTERED INDEX [idx_#application.tablename#_url] ON [dbo].[#application.tablename#] (
    [url_hash] ASC
  ) WITH (
    PAD_INDEX  = OFF,
    STATISTICS_NORECOMPUTE  = OFF,
    SORT_IN_TEMPDB = OFF,
    IGNORE_DUP_KEY = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS  = ON,
    ALLOW_PAGE_LOCKS  = ON
  ) ON [PRIMARY];
  </cfquery>
</cfif>

<!--- Main import loop: walk the directory listing and process each file
      whose name matches `filter`. Row ids are built as
      "1" & fileId & 10-digit-line-number, which makes every (file, line)
      pair unique and lets an interrupted import resume from max(id). --->
<cfloop query="files">

  <cfif REFindNoCase(filter, files.name)>

      <cfset i = 0 />
      <cfset strFile = files.name />
      <!--- Numeric file id extracted from the filename via group 1. --->
      <cfset strFileId = REReplace(strFile, filter, "\1") />
      <!--- Java System handle, used further down to request GC. --->
      <cfset objSystem = CreateObject("java", "java.lang.System") />
      <cfset getMaxId = "" />
      <cfset start_i = 0 />

      <!--- Exclusive lock so only one importer runs per site+file. --->
      <cflock name="GenerateStats_#application.sitename#_#strFileId#" type="exclusive" timeout="10">

        <cflog file="GenerateStats_#application.sitename#" text="#strFile# Started" />

        <!--- Resume support: find the highest id already stored for this
              file's id range (1<fileId>0000000000 .. 1<fileId>9999999999). --->
        <cfquery name="getMaxId" datasource="#application.dsn#">
        SELECT
          max(id) as n
        FROM
          #application.tablename#
        WHERE
          id >= <cfqueryparam cfsqltype="CF_SQL_BIGINT" value="1#strFileId##numberFormat(0,'0000000000')#">
        AND
          id <= <cfqueryparam cfsqltype="CF_SQL_BIGINT" value="1#strFileId##numberFormat(9999999999,'0000000000')#">
        </cfquery>

        <!--- Subtracting the range base recovers the last line number
              imported; lines up to start_i are skipped below. --->
        <cfif getMaxId.recordcount GT 0 AND getMaxId.n GT 0>
          <cfset start_i = getMaxId.n - "1#strFileId#0000000000" />
        </cfif>

        <cflog file="GenerateStats_#application.sitename#" text="#strFile# Log start = #start_i#" />

        <!--- Read the log one line at a time (backslash path separator:
              this presumably runs on Windows -- confirm). --->
        <cfloop file="#logpath#\#strFile#" index="line">
          <cfset i = i + 1 />

          <cfif i GT start_i>

            <cfset strId = "1#strFileId##numberFormat(i,'0000000000')#" />

            <!--- Unparseable lines are logged and skipped rather than
                  aborting the whole import. --->
            <cftry>

              <cfset strIp       = REReplaceNoCase(line, regex, "\1") />
              <cfset strUnknown1 = REReplaceNoCase(line, regex, "\2") />
              <cfset strUnknown2 = REReplaceNoCase(line, regex, "\3") />
              <cfset strDatetime = REReplaceNoCase(line, regex, "\4") />
              <cfset strUrl      = REReplaceNoCase(line, regex, "\6") />
              <cfset strMethod   = REReplaceNoCase(line, regex, "\5") />
              <cfset strStatus   = REReplaceNoCase(line, regex, "\8") />
              <cfset strSize     = REReplaceNoCase(line, regex, "\9") />

              <!--- Rebuild a CF date from the bracketed log timestamp;
                    month abbreviation goes through month_map. --->
              <cfset dtDateTime = CreateDateTime(
                REReplaceNoCase(strDatetime, date_regex, "\3"),
                month_map[REReplaceNoCase(strDatetime, date_regex, "\2")],
                REReplaceNoCase(strDatetime, date_regex, "\1"),
                REReplaceNoCase(strDatetime, date_regex, "\4"),
                REReplaceNoCase(strDatetime, date_regex, "\5"),
                REReplaceNoCase(strDatetime, date_regex, "\6")
              ) />

              <!--- url_hash is stored so the (long) url column can be
                    matched via an indexable fixed-length value. --->
              <cfquery datasource="#application.dsn#">
              INSERT INTO #application.tablename# (
                id,
                ip,
                unknown1,
                unknown2,
                datetime,
                url,
                url_hash,
                method,
                status,
                size
              ) VALUES (
                <cfqueryparam cfsqltype="CF_SQL_BIGINT" value="#strId#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strIp#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strUnknown1#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strUnknown2#" />,
                <cfqueryparam cfsqltype="CF_SQL_TIMESTAMP" value="#dtDateTime#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strUrl#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#hash(strUrl)#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strMethod#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strStatus#" />,
                <cfqueryparam cfsqltype="CF_SQL_INTEGER" value="#strSize#" />
              )
              </cfquery>

              <cfcatch>
                <cflog file="GenerateStats_#application.sitename#" text="Error on #strFile# line #i#: #cfcatch.message# (#cfcatch.detail#)" />
              </cfcatch>

            </cftry>

          </cfif>

          <!--- Every 500 lines: allow a graceful abort via a "stop.file"
                marker, then request GC and sleep briefly to throttle. --->
          <cfif i MOD 500 EQ 0>
            <cfif fileexists(expandPath("stop.file"))>
              <cflog file="GenerateStats_#application.sitename#" text="#strFile# Aborted: Stop file exists." />
              <cfbreak />
            </cfif>
            <cfset objSystem.GC() />
            <cfthread action="sleep" duration="100" />
          </cfif>

          <!--- Progress heartbeat every 10k lines. --->
          <cfif i MOD 10000 EQ 0>
            <cflog file="GenerateStats_#application.sitename#" text="#strFile# Processed #numberformat(i)# lines" />
          </cfif>

        </cfloop>

        <cflog file="GenerateStats_#application.sitename#" text="#strFile# Completed (#i# lines)" />

      </cflock>

  </cfif>

  <!--- Stop processing further files too when the marker exists. --->
  <cfif fileexists(expandPath("stop.file"))>
    <cfbreak />
  </cfif>

</cfloop>

<cfsetting enablecfoutputonly="false" />

This was going OK with SQL Server Express. The logs would be imported and then analysed. Whilst it wasn’t blazingly fast, it could import a few hundred meg of logs data in a few minutes so I could start generating reports.

Then one day I got the request to do the same thing but for the company’s busiest site – their intranet. Each log file exceeded a gig, which often meant over 7 million records per file. Unfortunately, SQL Server Express was not keeping up well: it would start off fine, processing around 500 records a second, but as time went by this crawled down to <10 records per second, PLUS, after an overnight run I realised that SQL Server stopped accepting records, with the data file blown out to 4GB at just 3 million records. That's only HALF a log file!

I started looking at my options. The company I work at loves SQL Server, so I thought of ways to make it work. Hmm. Storing fewer fields? Sacrificing my indexes? Those are not good options. Then, I thought back to my old days of using MySQL. I remember how temperamental it used to be: joins were always slow, queries had to be arranged so that they would be optimised in a certain way, etc. But then, my app is only using one table. My queries are basic (for the most part). It was worth a try.

So, I started again. I created the table in InnoDB. That was a bad move, it was slow from the start at only 40 records per sec. But then, I'm the only one using this app. I don't need transactions, why am I using InnoDB? The answer was clear - try MyISAM. So I dropped and recreated the table as MyISAM and restarted the import. Wow - instantly the result was clear - over 1000 records/sec!

Here is the equivalent MySQL code:

	<!--- Reset the MySQL reporting table (MyISAM for bulk-insert speed).
	      Fixes vs. the original listing: the datasource attributes used
	      typographic smart quotes (a syntax error when pasted), and
	      DROP TABLE now uses IF EXISTS so the reset also works on the
	      first run when no table exists yet. DROP and CREATE stay in
	      separate cfquery calls because many JDBC configurations reject
	      multi-statement query strings. --->
	<cfquery datasource="#application.dsn#">
	DROP TABLE IF EXISTS #application.tablename#;
	</cfquery>
	<cfquery datasource="#application.dsn#">
	CREATE TABLE #application.tablename# (
	  id BIGINT UNSIGNED NOT NULL,
	  ip VARCHAR(50) NULL,
	  datetime DATETIME NULL,
	  url TEXT NULL,
	  url_hash VARCHAR(50) NULL,
	  method VARCHAR(50) NULL,
	  status VARCHAR(50) NULL,
	  size VARCHAR(50) NULL,
	  unknown1 VARCHAR(50) NULL,
	  unknown2 VARCHAR(50) NULL,
	  PRIMARY KEY (id),
	  INDEX idx_#application.tablename#_date(datetime),
	  INDEX idx_#application.tablename#_ip(ip),
	  INDEX idx_#application.tablename#_url(url_hash)
	) ENGINE=MyISAM;
	</cfquery>

The import has now been running overnight and there is no performance degradation. Imports are still running between 800-1000 records/sec and the table now contains 48 million records! And the file size? 10GB including indexes - this is only 2 and a half times the SQL Server file yet it is holding 16 times the amount of data! AND I have just started running queries - queries that took over 60 seconds to execute in SQL server on less than a million rows are now taking a similar time but I have over 48 million records in MySQL.

I think this is a perfect case of the right tool for the right purpose. MySQL - you're a life saver.

February 19, 2010

Another way to stop wordpress spammers

Filed under: PHP — joel.cass @ 9:22 am

It seems that getting a good ranking on a search engine can be a double-edged sword. Whilst it helps people find you, it also helps nasty spammers to find a way into your site so they can post their comments. Sure, wordpress does have some spam filtering ability thanks to the akismet plugin, but it would be so much better if they could be stopped at the source.

One way I have stopped spammers on my site is by implementing a text field inside a hidden block, as follows:

<!-- would recommend that this actually goes into a CSS file -->
<style type="text/css">
	.spam-check { display:none; }
</style>
..........................
<!-- spam detection: honeypot field, hidden by the .spam-check CSS rule.
     Humans never see (or fill in) it; form-filling bots populate every
     input, and the server rejects any submission where this field is
     non-empty. The label covers users browsing without CSS. -->
<p class="spam-check">
	<input type="text" name="spamcheck" id="fldSpamCheck" value="">
	<label for="fldSpamCheck">Please leave this field blank</label>
</p>
<!-- /spam detection -->

..and then on the processing side I would do something as follows:

// jnet spam detection: the honeypot field must be present AND left empty;
// a missing or filled-in field means the submission did not come from the
// real form (or came from a bot) and is rejected.
$honeypot = isset($_POST['spamcheck']) ? $_POST['spamcheck'] : null;
if ($honeypot === null || $honeypot != "") {
	die('Error: please do not fill in the field that tells you not to fill it in.');
}
// end jnet spam detection

The beauty of it is that there are no CAPTCHA’s involved, and no thinking on the user’s side. The common trap that these spambots fall into is that they fill out all the fields in the form with useless garbage. Because this field is meant to be blank, the submission fails.

Furthermore, any user with a CSS enabled browser does not see the field. A user who can see the field can tell from the label that it is not meant to be filled in. So everyone wins! (Except the nasty spammers).

In wordpress, you will need to add the first code block to the files:

  • /wp-content/themes/[your_theme]/comments.php
  • /wp-content/themes/[your_theme]/comments-popup.php

…and you will need to add the second code block to the following file:

  • /wp-comments-post.php

…and you’ll be done! At least until the spammers figure out how to get around it.

February 2, 2010

Enable AJAX on almost any site using jQuery

Filed under: AJAX — joel.cass @ 11:47 am

A website I was working on a while back had a requirement for an animated background to be running whilst users were on the website. The main problem was, that every time a user would click on a new page, the background would restart, interrupting the user experience and slowing things down in general. So what do we do? Inline frames? Create an AJAX service that returns the site content?

No, the answer is even simpler. Enter AJAX using jQuery. All you need is to have two div elements with identifiers surrounding the main content of your page, for example:

Example page structure

Example page structure

If you have the above structure and have the jQuery libraries loaded, then all you need to do is implement the following function:

/*
 * Swap the contents of #content-outer for the #content fragment of the
 * page at URL, fetched via jQuery's AJAX .load(); ajax_loaded runs when
 * the request completes. Returns false so this can be wired straight
 * into a link's onclick handler and cancel normal navigation.
 */
function loadPage(URL) {
    var target = $("#content-outer");
    target.load(URL + " #content", null, ajax_loaded);
    return false;
}

jQuery uses CSS selectors to identify elements, e.g. <div id=”content”> = #content, so what this function does is it looks up the element #content-outer, executes the AJAX request, and then looks up the #content element from the next page and replaces the contents of the #content-outer element with the new element, as demonstrated in the below diagram.

How requests are made using jQuery

How requests are made using jQuery

I have also added the following routine that will be called on startup. It effectively adds the loadPage(URL) method to each link’s onclick method. Because loadPage(URL) returns false, when the link is clicked, the AJAX call will be initialised and the user’s action will be cancelled.

/*
 * Startup hook: once the DOM is ready, wire up every anchor on the page
 * so clicks go through the AJAX loader instead of a full page load.
 */
function check_on_load() {
    updateLinks("A");
}

$(document).ready(check_on_load);

/*
 * Attach the AJAX loader to every link matched by the DOMLocation
 * selector. A link is left untouched when any of these hold:
 *   - it already has an onclick handler,
 *   - it points outside this site (href does not start at the base URL),
 *   - its href contains an anchor (#),
 *   - it has a target attribute set,
 *   - its class list contains 'no-ajax'.
 */
function updateLinks(DOMLocation) {
    var anchors = $(DOMLocation);
    var siteRoot = location.protocol + "//" + location.host + "/";
    for (var idx = 0; idx < anchors.length; idx++) {
        var anchor = anchors[idx];
        var isInternal = anchor.href.indexOf(siteRoot) == 0;
        var hasFragment = anchor.href.indexOf("#") >= 0;
        var isPlain = anchor.onclick == null &&
                anchor.target == "" &&
                anchor.className.toLowerCase().indexOf("no-ajax") < 0;
        if (isPlain && isInternal && !hasFragment) {
            anchor.onclick = Function("return loadPage(this.href);");
        }
    }
}

Furthermore, I have also added a callback to the method which will update the page title and replace any links within the new page content.

/*
 * jQuery .load() completion callback. On success, sync the document
 * title with the freshly loaded page and re-wire the links inside the
 * replaced content region; on error, surface the failing request.
 */
function ajax_loaded(responseText, textStatus, XMLHttpRequest) {
    if (textStatus != "error") {
        document.title = titleFromHTML(responseText);
        updateLinks("#content-outer A");
    } else {
        prompt("URL Failed: ", XMLHttpRequest);
    }
}

/*
 * Extract the text of the <title> element from a raw HTML string.
 * Returns "" when no title is present.
 *
 * Fixes: the original built `new RegExp("")` (the actual pattern was
 * evidently stripped by the blog's markup), so matches[1] was always
 * undefined; it also threw a TypeError on non-matching input because
 * RegExp.exec returns null — not an array — when there is no match.
 */
function titleFromHTML(HTML) {
    // [\s\S]*? spans newlines inside the title; case-insensitive tag match.
    var regex = new RegExp("<title[^>]*>([\\s\\S]*?)</title>", "i");
    var matches = regex.exec(HTML);
    if (matches != null && matches.length > 1) {
        return matches[1];
    } else {
        return "";
    }
}

...And the best part? By simply including the script within your page along with jQuery (and making sure that your structure is correct), you will instantly enable AJAX-based loading on your page. Even better, if your browser does not fully support AJAX, javascript, or jQuery, all links will remain untouched and function just as they normally do.

Download jQuery here: http://docs.jquery.com/Downloading_jQuery
Download the script here: ajax-plugin_no_history.js
Demo: demo_no_history.htm

The only thing I have left out of this is history management. By using the script above, users lose the ability to switch back and forth between pages. By using the jQuery history plugin, you can give users the ability to switch back and forth between pages, and also the ability to bookmark AJAX'd pages.

Download jQuery history plugin here: http://www.mikage.to/jquery/jquery_history.html
Download the history enabled script here: ajax-plugin_with_history.js
Demo: demo_with_history.htm