Hello,
I have written a small app to parse web log files and extract certain
lines to another file. There is also functionality to count all the
items that are being filtered out.
I wrote this in C# instead of in Perl because the log files are 3-4 GB
and I want faster processing than Perl would typically provide. And,
I'm learning C#.
There are two issues I would like to address: improving the speed of the
file I/O and controlling the processing. Right now, this app takes about 20
minutes to process a 3 GB file on a laptop with a 2 GHz processor and 2 GB
of RAM. "Processing" here means running the method that both filters and
counts. Also, it pegs my CPU while it's running.
Below are the filtering and filtering/counting methods.
Thanks.
mp
/// <summary>
/// Filters a web log file: lines that reference static assets (.gif, .js,
/// .png, .css, .jpg followed by whitespace) are dropped, every other line is
/// copied to "&lt;fileName&gt;.modified.log". Optionally counts what was dropped.
/// </summary>
public class parseLines
{
    // Alternation of the asset extensions that are filtered out. Each
    // alternative requires trailing whitespace so the extension must end
    // the requested path token.
    static readonly string fileIdentifiers =
        @"\.gif\s|\.js\s|\.png\s|\.css\s|\.jpg\s";

    // Combined pattern: one test per line decides keep vs. drop.
    static readonly Regex reAll =
        new Regex(fileIdentifiers, RegexOptions.Compiled);

    // The individual alternatives, in declaration order. These strings are
    // also the keys under which per-type counts are reported.
    static readonly string[] typePatterns = fileIdentifiers.Split('|');

    // One regex per alternative, built once instead of on every call.
    static readonly Regex[] typeRegexes = buildTypeRegexes(typePatterns);

    // Keys of the two summary counters in the returned table.
    const string TotalMatchKey = "total Match";
    const string TotalNoMatchKey = "total No Match";

    string fileName;

    /// <summary>Creates a parser for the given log file.</summary>
    /// <param name="fileName">Path of the log file to read.</param>
    public parseLines(string fileName)
    {
        this.fileName = fileName;
    }

    /// <summary>
    /// Placeholder: the body was omitted from the original listing.
    /// </summary>
    public void getLines()
    {
        // print nonmatching lines to stdout
    }

    /// <summary>
    /// Placeholder: the body was omitted from the original listing.
    /// </summary>
    /// <exception cref="System.NotImplementedException">Always.</exception>
    public Hashtable countMatches()
    {
        // count individual matches
        throw new System.NotImplementedException(
            "countMatches: body was not included in the original listing.");
    }

    /// <summary>
    /// Copies every line that does not match the asset pattern to
    /// "&lt;fileName&gt;.modified.log".
    /// </summary>
    public void filterLines()
    {
        string newFileName = fileName + ".modified.log";

        // using blocks guarantee both files are closed (and the writer
        // flushed) even if reading or writing throws part-way through.
        using (StreamReader sr = new StreamReader(fileName))
        using (StreamWriter wr = new StreamWriter(newFileName))
        {
            string nextLine;
            while ((nextLine = sr.ReadLine()) != null)
            {
                if (!reAll.IsMatch(nextLine))
                {
                    wr.WriteLine(nextLine);
                }
            }
        }
    }

    /// <summary>
    /// Same filtering as <see cref="filterLines"/>, and additionally counts
    /// the lines that were dropped, per asset type.
    /// </summary>
    /// <returns>
    /// A table with one entry per asset pattern (key = the pattern text,
    /// value = number of dropped lines attributed to it) plus the keys
    /// "total Match" (lines dropped) and "total No Match" (lines kept).
    /// All values are boxed <c>int</c>.
    /// </returns>
    public Hashtable filterAndCountLines()
    {
        string newFileName = fileName + ".modified.log";

        // Count in plain ints while streaming; the Hashtable is filled once
        // at the end so the hot loop does no boxing or hash lookups.
        int[] typeCounts = new int[typeRegexes.Length];
        int totalMatch = 0;
        int totalNoMatch = 0;

        using (StreamReader sr = new StreamReader(fileName))
        using (StreamWriter wr = new StreamWriter(newFileName))
        {
            string nextLine;
            while ((nextLine = sr.ReadLine()) != null)
            {
                if (!reAll.IsMatch(nextLine))
                {
                    wr.WriteLine(nextLine);
                    totalNoMatch++;
                    continue;
                }

                totalMatch++;

                // Attribute the line to the first pattern (in declaration
                // order) that occurs anywhere in it; a line is counted
                // under at most one type.
                for (int i = 0; i < typeRegexes.Length; i++)
                {
                    if (typeRegexes[i].IsMatch(nextLine))
                    {
                        typeCounts[i]++;
                        break;
                    }
                }
            }
        }

        Hashtable ht = new Hashtable();
        for (int i = 0; i < typePatterns.Length; i++)
        {
            ht.Add(typePatterns[i], typeCounts[i]);
        }
        ht.Add(TotalMatchKey, totalMatch);
        ht.Add(TotalNoMatchKey, totalNoMatch);
        return ht;
    }

    // Builds one compiled regex per pattern, sized from the input rather
    // than a hard-coded count.
    static Regex[] buildTypeRegexes(string[] patterns)
    {
        Regex[] result = new Regex[patterns.Length];
        for (int i = 0; i < patterns.Length; i++)
        {
            result[i] = new Regex(patterns[i], RegexOptions.Compiled);
        }
        return result;
    }
}
/// <summary>
/// Command-line entry point: filters the log file named by the first
/// argument and prints the resulting counts.
/// </summary>
class MainClass
{
    /// <summary>
    /// Runs the filter-and-count pass over <c>args[0]</c> and writes each
    /// "key : value" pair of the returned table to standard output,
    /// followed by "finished".
    /// </summary>
    /// <param name="args">Command-line arguments; the first one is the log file path.</param>
    public static void Main(string[] args)
    {
        // Without this guard a missing argument surfaces as an
        // IndexOutOfRangeException instead of a usable message.
        if (args == null || args.Length == 0)
        {
            Console.Error.WriteLine("usage: <program> <logfile>");
            Environment.ExitCode = 1;
            return;
        }

        parseLines pl = new parseLines(args[0]);
        Hashtable count = pl.filterAndCountLines();

        foreach (DictionaryEntry entry in count)
        {
            Console.WriteLine(entry.Key.ToString() + " : " +
                entry.Value.ToString());
        }

        Console.WriteLine("finished");
    }
}
}
--
Michael Powe mi*****@trollop e.org Waterbury CT
ENOSIG: signature file is empty