StructureCMS

February 25, 2010

Rediscovering MySQL

Filed under: ColdFusion, Database — joel.cass @ 9:14 am

Recently I have had to create a reporting system for some server log files. It was sort of an ad-hoc thing; it was really done in a rush because we couldn’t get AWStats to process the files properly and the customer had a really steep deadline to meet.

Anyway, I started off by creating a system that can read in log files following a regular expression. It would then add the contents of these logs to the database for reporting purposes. The code I used is below:

<!--- Suppress stray whitespace output and allow a very long-running request (864000s = 10 days). --->
<cfsetting enablecfoutputonly="true" requesttimeout="864000" />

<!--- Where the raw log files live, and which filenames to import (group 1 captures the file id). --->
<cfparam name="logpath" default="#expandPath('logs/')#" />
<cfparam name="filter" default="^nc([0-9]*)\.log$" />

<!--- Newest-named files first. --->
<cfdirectory action="list" directory="#logpath#" name="files" sort="name DESC" />

<!--- Parser for one common-log-format line, and for its [dd/Mon/yyyy:hh:mm:ss ...] timestamp field. --->
<cfset regex = "^([^ ]*) ([^ ]*) ([^ ]*) (\[[^\]]*\]) ""([^ ]*) (.*) ([^ ]*)"" ([^ ]*) ([^ ]*)$" />
<cfset date_regex = "^\[([0-9]*)/([A-Za-z]*)/([0-9]*)\:([0-9]*)\:([0-9]*)\:([0-9]*).*\]$" />

<!--- Map the log's three-letter month abbreviations to two-digit month numbers ("Jan" -> "01"). --->
<cfset month_map = structNew() />
<cfset month_names = "Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec" />
<cfloop from="1" to="12" index="monthNo">
  <cfset month_map[listGetAt(month_names, monthNo)] = numberFormat(monthNo, "00") />
</cfloop>

<cfif structKeyExists(URL, "resetDB")>
  <!--- Destructive reset: hitting this page with ?resetDB drops and rebuilds the stats table.
        NOTE(review): there is no authentication or confirmation on this switch — anyone who
        can reach the URL can wipe the data. Consider guarding it. --->
  <cfquery datasource="#application.dsn#">
  -- Drop only when the table already exists, so the very first reset on a
  -- fresh database does not fail with "Cannot drop the table ...".
  IF OBJECT_ID('#application.tablename#', 'U') IS NOT NULL
    DROP TABLE #application.tablename#;
  -- One row per log line. id encodes file + line number (see import loop below),
  -- so it doubles as a resume marker. url is ntext and cannot be indexed directly;
  -- url_hash stores a hash of the url for index lookups instead.
  CREATE TABLE #application.tablename#(
    [id] [bigint] NOT NULL,
    [ip] [nvarchar](50) NULL,
    [datetime] [datetime] NULL,
    [url] [ntext] NULL,
    [url_hash] [nvarchar](50) NULL,
    [method] [nvarchar](50) NULL,
    [status] [nvarchar](50) NULL,
    [size] [nvarchar](50) NULL,
    [unknown1] [nvarchar](50) NULL,
    [unknown2] [nvarchar](50) NULL,
  CONSTRAINT [PK_#application.tablename#] PRIMARY KEY CLUSTERED (
    [id] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    IGNORE_DUP_KEY = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
  ) ON [PRIMARY]
  ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];

  -- Supporting indexes for the reporting queries: by timestamp, by client IP,
  -- and by URL hash (stands in for the unindexable ntext url column).
  CREATE NONCLUSTERED INDEX [idx_#application.tablename#_date] ON [dbo].[#application.tablename#] (
    [datetime] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    IGNORE_DUP_KEY = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
  ) ON [PRIMARY];

  CREATE NONCLUSTERED INDEX [idx_#application.tablename#_ip] ON [dbo].[#application.tablename#] (
    [ip] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    IGNORE_DUP_KEY = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
  ) ON [PRIMARY];

  CREATE NONCLUSTERED INDEX [idx_#application.tablename#_url] ON [dbo].[#application.tablename#] (
    [url_hash] ASC
  ) WITH (
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    IGNORE_DUP_KEY = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
  ) ON [PRIMARY];
  </cfquery>
</cfif>

<!--- Walk every file in the log directory and import the ones matching the filename
      filter. Row ids are built as 1<fileId><10-digit line number>, which makes each
      file's rows a contiguous id range — that range is used both for the exclusive
      lock name and for resuming a previously interrupted import. --->
<cfloop query="files">

  <cfif REFindNoCase(filter, files.name)>

      <cfset i = 0 />
      <cfset strFile = files.name />
      <!--- Numeric file id captured by the filter's group 1 (e.g. "nc42.log" -> "42"). --->
      <cfset strFileId = REReplace(strFile, filter, "\1") />
      <!--- java.lang.System is used further down to request a GC every 500 lines. --->
      <cfset objSystem = CreateObject("java", "java.lang.System") />
      <cfset getMaxId = "" />
      <cfset start_i = 0 />

      <!--- One importer per file at a time; a second request for the same file times out after 10s. --->
      <cflock name="GenerateStats_#application.sitename#_#strFileId#" type="exclusive" timeout="10">

        <cflog file="GenerateStats_#application.sitename#" text="#strFile# Started" />

        <!--- Highest id already stored inside this file's id range, so a previously
              interrupted import can resume instead of starting over. --->
        <cfquery name="getMaxId" datasource="#application.dsn#">
        SELECT
          max(id) as n
        FROM
          #application.tablename#
        WHERE
          id >= <cfqueryparam cfsqltype="CF_SQL_BIGINT" value="1#strFileId##numberFormat(0,'0000000000')#">
        AND
          id <= <cfqueryparam cfsqltype="CF_SQL_BIGINT" value="1#strFileId##numberFormat(9999999999,'0000000000')#">
        </cfquery>

        <cfif getMaxId.recordcount GT 0 AND getMaxId.n GT 0>
          <!--- Subtracting the range base converts the stored id back to a plain line number. --->
          <cfset start_i = getMaxId.n - "1#strFileId#0000000000" />
        </cfif>

        <cflog file="GenerateStats_#application.sitename#" text="#strFile# Log start = #start_i#" />

        <!--- Stream the file line by line (avoids loading a multi-GB log into memory).
              NOTE(review): backslash path separator assumes a Windows server — confirm. --->
        <cfloop file="#logpath#\#strFile#" index="line">
          <cfset i = i + 1 />

          <!--- Skip lines already imported on a previous run. --->
          <cfif i GT start_i>

            <cfset strId = "1#strFileId##numberFormat(i,'0000000000')#" />

            <!--- A malformed line only logs an error and is skipped; the import continues. --->
            <cftry>

              <!--- Pull the capture groups out of the log line. Groups 5/6 are
                    method/url — note the assignments below are deliberately out of
                    numeric order to keep the variable list readable. --->
              <cfset strIp       = REReplaceNoCase(line, regex, "\1") />
              <cfset strUnknown1 = REReplaceNoCase(line, regex, "\2") />
              <cfset strUnknown2 = REReplaceNoCase(line, regex, "\3") />
              <cfset strDatetime = REReplaceNoCase(line, regex, "\4") />
              <cfset strUrl      = REReplaceNoCase(line, regex, "\6") />
              <cfset strMethod   = REReplaceNoCase(line, regex, "\5") />
              <cfset strStatus   = REReplaceNoCase(line, regex, "\8") />
              <cfset strSize     = REReplaceNoCase(line, regex, "\9") />

              <!--- [dd/Mon/yyyy:hh:mm:ss ...] -> a real date object; month name is
                    translated via month_map. An unparseable date throws into the catch. --->
              <cfset dtDateTime = CreateDateTime(
                REReplaceNoCase(strDatetime, date_regex, "\3"),
                month_map[REReplaceNoCase(strDatetime, date_regex, "\2")],
                REReplaceNoCase(strDatetime, date_regex, "\1"),
                REReplaceNoCase(strDatetime, date_regex, "\4"),
                REReplaceNoCase(strDatetime, date_regex, "\5"),
                REReplaceNoCase(strDatetime, date_regex, "\6")
              ) />

              <!--- url_hash is stored alongside the raw url so reports can join/filter
                    on an indexable fixed-length column. --->
              <cfquery datasource="#application.dsn#">
              INSERT INTO #application.tablename# (
                id,
                ip,
                unknown1,
                unknown2,
                datetime,
                url,
                url_hash,
                method,
                status,
                size
              ) VALUES (
                <cfqueryparam cfsqltype="CF_SQL_BIGINT" value="#strId#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strIp#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strUnknown1#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strUnknown2#" />,
                <cfqueryparam cfsqltype="CF_SQL_TIMESTAMP" value="#dtDateTime#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strUrl#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#hash(strUrl)#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strMethod#" />,
                <cfqueryparam cfsqltype="CF_SQL_VARCHAR" value="#strStatus#" />,
                <cfqueryparam cfsqltype="CF_SQL_INTEGER" value="#strSize#" />
              )
              </cfquery>

              <cfcatch>
                <cflog file="GenerateStats_#application.sitename#" text="Error on #strFile# line #i#: #cfcatch.message# (#cfcatch.detail#)" />
              </cfcatch>

            </cftry>

          </cfif>

          <!--- Every 500 lines: allow a graceful abort via a "stop.file" marker, then
                request a GC and yield briefly so the server stays responsive. --->
          <cfif i MOD 500 EQ 0>
            <cfif fileexists(expandPath("stop.file"))>
              <cflog file="GenerateStats_#application.sitename#" text="#strFile# Aborted: Stop file exists." />
              <cfbreak />
            </cfif>
            <cfset objSystem.GC() />
            <cfthread action="sleep" duration="100" />
          </cfif>

          <!--- Progress heartbeat every 10k lines. --->
          <cfif i MOD 10000 EQ 0>
            <cflog file="GenerateStats_#application.sitename#" text="#strFile# Processed #numberformat(i)# lines" />
          </cfif>

        </cfloop>

        <cflog file="GenerateStats_#application.sitename#" text="#strFile# Completed (#i# lines)" />

      </cflock>

  </cfif>

  <!--- The stop file also aborts the outer per-file loop. --->
  <cfif fileexists(expandPath("stop.file"))>
    <cfbreak />
  </cfif>

</cfloop>

<!--- Restore normal output behaviour now that the batch run is finished. --->
<cfsetting enablecfoutputonly="false" />

This was going OK with SQL Server Express. The logs would be imported and then analysed. Whilst it wasn’t blazingly fast, it could import a few hundred meg of logs data in a few minutes so I could start generating reports.

Then one day I got the request to do the same thing but for the company’s busiest site – their intranet. Each log file exceeded a gig, which often meant over 7 million records per file. Unfortunately, SQL Server Express was not keeping up well: it would start off fine, processing around 500 records a second, but as time went by this crawled down to <10 records per second. Worse, after an overnight run I realised that SQL Server had stopped accepting records, with the data file blown out to 4GB at just 3 million records (4GB being the database size limit of SQL Server Express at the time). That's only HALF a log file!

I started looking at my options. The company I work at loves SQL Server, so I thought of ways to make it work. Hmm. Storing fewer fields? Sacrificing my indexes? Those are not good options. Then, I thought back to my old days of using MySQL. I remember how temperamental it used to be: joins were always slow, queries had to be arranged so that they would be optimised in a certain way, and so on. But then, my app is only using one table. My queries are basic (for the most part). It was worth a try.

So, I started again. I created the table in InnoDB. That was a bad move, it was slow from the start at only 40 records per sec. But then, I'm the only one using this app. I don't need transactions, why am I using InnoDB? The answer was clear - try MyISAM. So I dropped and recreated the table as MyISAM and restarted the import. Wow - instantly the result was clear - over 1000 records/sec!

Here is the equivalent MySQL code:

	<!--- NOTE: straight quotes here — the post's original curly quotes (”) are not valid CFML
	      attribute delimiters and would fail to compile. --->
	<!--- IF EXISTS keeps the reset from failing the first time it runs on a fresh schema. --->
	<cfquery datasource="#application.dsn#">
	DROP TABLE IF EXISTS #application.tablename#;
	</cfquery>
	<!--- MyISAM (no transactions, table-level locking) was chosen deliberately for
	      single-writer bulk-insert speed; url_hash is the indexable stand-in for the TEXT url. --->
	<cfquery datasource="#application.dsn#">
	CREATE TABLE #application.tablename# (
	  id BIGINT UNSIGNED NOT NULL,
	  ip VARCHAR(50) NULL,
	  datetime DATETIME NULL,
	  url TEXT NULL,
	  url_hash VARCHAR(50) NULL,
	  method VARCHAR(50) NULL,
	  status VARCHAR(50) NULL,
	  size VARCHAR(50) NULL,
	  unknown1 VARCHAR(50) NULL,
	  unknown2 VARCHAR(50) NULL,
	  PRIMARY KEY (id),
	  INDEX idx_#application.tablename#_date(datetime),
	  INDEX idx_#application.tablename#_ip(ip),
	  INDEX idx_#application.tablename#_url(url_hash)
	) ENGINE=MyISAM;
	</cfquery>

The import has now been running overnight and there is no performance degradation. Imports are still running between 800-1000 records/sec and the table now contains 48 million records! And the file size? 10GB including indexes - this is only 2 and a half times the SQL Server file yet it is holding 16 times the amount of data! AND I have just started running queries - queries that took over 60 seconds to execute in SQL Server on fewer than a million rows are now taking a similar time on over 48 million records in MySQL.

I think this is a perfect case of the right tool for the right purpose. MySQL - you're a life saver.

No Comments »

No comments yet.

RSS feed for comments on this post. TrackBack URL

Leave a comment