Hello,
I have written a small app to parse web log files and extract certain
lines to another file. There is also functionality to count all the
items that are being filtered out.
I wrote this in C# instead of in Perl because the log files are 3-4 GB
and I want faster processing than Perl would typically provide. Also,
I'm learning C#.
There are two issues I would like to address: improving the speed of the
file I/O and controlling the processing load. Right now, this app takes
about 20 minutes to process a 3 GB file on a laptop with a 2 GHz processor
and 2 GB of RAM. That timing is for the method that both filters and
counts. Also, it pegs my CPU while it's running.
Below are the filtering and filtering/counting methods.
Thanks.
mp
/// <summary>
/// Filters a web log file: lines that reference a static asset (.gif, .js,
/// .png, .css or .jpg followed by whitespace) are dropped, every other line is
/// copied to "&lt;fileName&gt;.modified.log". Can also count what was dropped.
/// </summary>
public class parseLines
{
    // Alternation of the asset patterns. Each alternative is also used verbatim
    // as a key in the Hashtable returned by filterAndCountLines().
    static readonly string fileIdentifiers =
        @"\.gif\s|\.js\s|\.png\s|\.css\s|\.jpg\s";

    // Built once and compiled, because it is evaluated against every input line.
    static readonly Regex reAll = new Regex(fileIdentifiers, RegexOptions.Compiled);

    string fileName;

    /// <summary>Creates a parser for the log file at <paramref name="fileName"/>.</summary>
    /// <param name="fileName">Path of the log file to read.</param>
    public parseLines(string fileName)
    {
        this.fileName = fileName;
    }

    // Body not shown in this listing; only the original placeholder is kept.
    public void getLines()
    {
        // print nonmatching lines to stdout
    }

    // Body not shown in this listing; only the original placeholder is kept.
    public Hashtable countMatches()
    {
        // count individual matches
    }

    /// <summary>
    /// Copies every line that does NOT match any asset pattern to
    /// "&lt;fileName&gt;.modified.log".
    /// </summary>
    public void filterLines()
    {
        string newFileName = fileName + ".modified.log";

        // using guarantees both files are closed even if reading or writing throws.
        using (StreamReader sr = new StreamReader(fileName))
        using (StreamWriter wr = new StreamWriter(newFileName))
        {
            string nextLine;
            while ((nextLine = sr.ReadLine()) != null)
            {
                if (!reAll.IsMatch(nextLine))
                {
                    wr.WriteLine(nextLine);
                }
            }
        }
    }

    /// <summary>
    /// Same filtering as <see cref="filterLines"/>, and additionally counts the
    /// filtered lines.
    /// </summary>
    /// <returns>
    /// A Hashtable of boxed int counters keyed by each individual pattern from
    /// <c>fileIdentifiers</c> (for example <c>\.gif\s</c>), plus the keys
    /// "total Match" and "total No Match". A filtered line is attributed to the
    /// first pattern, in declaration order, that matches it.
    /// </returns>
    public Hashtable filterAndCountLines()
    {
        string newFileName = fileName + ".modified.log";

        // Derive the per-pattern regexes from the alternation, sized from the
        // split result so adding or removing a pattern needs no other change.
        string[] patterns = fileIdentifiers.Split('|');
        Regex[] patternRegexes = new Regex[patterns.Length];
        for (int i = 0; i < patterns.Length; i++)
        {
            patternRegexes[i] = new Regex(patterns[i], RegexOptions.Compiled);
        }

        // Plain int counters keep the per-line loop free of Hashtable lookups
        // and of the unbox/re-box that incrementing a Hashtable value requires.
        int[] patternCounts = new int[patterns.Length];
        int totalMatch = 0;
        int totalNoMatch = 0;

        using (StreamReader sr = new StreamReader(fileName))
        using (StreamWriter wr = new StreamWriter(newFileName))
        {
            string nextLine;
            while ((nextLine = sr.ReadLine()) != null)
            {
                if (!reAll.IsMatch(nextLine))
                {
                    wr.WriteLine(nextLine);
                    totalNoMatch++;
                    continue;
                }

                // The combined regex matched; find which pattern to credit.
                for (int i = 0; i < patternRegexes.Length; i++)
                {
                    if (patternRegexes[i].IsMatch(nextLine))
                    {
                        patternCounts[i]++;
                        break;
                    }
                }
                totalMatch++;
            }
        }

        // Publish the counters under the same keys, in the same insertion order,
        // that callers have always received.
        Hashtable ht = new Hashtable();
        for (int i = 0; i < patterns.Length; i++)
        {
            ht.Add(patterns[i], patternCounts[i]);
        }
        ht.Add("total Match", totalMatch);
        ht.Add("total No Match", totalNoMatch);
        return ht;
    }
}
/// <summary>
/// Command-line entry point: filters the log file named by the first argument
/// and prints the resulting counters.
/// </summary>
class MainClass
{
    /// <summary>
    /// Runs parseLines.filterAndCountLines() on the file given as the first
    /// argument and writes each "key : value" counter, then "finished", to
    /// standard output.
    /// </summary>
    /// <param name="args">Command-line arguments; args[0] is the log file path.</param>
    public static void Main(string[] args)
    {
        // Without this guard a missing argument surfaces as an unhandled
        // IndexOutOfRangeException instead of a usable message.
        if (args == null || args.Length == 0)
        {
            Console.Error.WriteLine(
                "Usage: pass the path of the log file to process as the first argument.");
            Environment.ExitCode = 1;
            return;
        }

        parseLines pl = new parseLines(args[0]);
        Hashtable count = pl.filterAndCountLines();

        // Enumerating a Hashtable yields DictionaryEntry items.
        foreach (DictionaryEntry entry in count)
        {
            Console.WriteLine(entry.Key.ToString() + " : " +
                entry.Value.ToString());
        }
        Console.WriteLine("finished");
    }
}
}
--
Michael Powe mi*****@trollope.org Waterbury CT
ENOSIG: signature file is empty