José Joye <jo*******@KILL THESPAMSbluewin .ch> wrote:
In fact, my strings are OCR-B lines read from Bank/Post Slips.
Each line should contain at most 80 chars. I have removed the leading and
trailing spaces with the Trim()
method.
So this can be some samples:
"0100 000187004>221 74101 02080003 95200208060+ 010184 473>"
"01 00000179008>0 00050175 7500100007054 24008+ 0103 97904>"
"01 0000006630 3>104922 351100079647820 000008+ 010194507>"
"0 00000000000 000111033122108 + 077782103 >"
"5 00002700>"
"0100000241504> 113730619000003 472360720026+ 010231043>"
Righto.
Running the code at the bottom, here are the results I got:
Benchmarking type MultiSpace
Run #1
RegexReplace 00:00:51.1034832
RegexReplaceWithTest 00:00:45.9443136
CompiledRegexReplaceWithTest 00:00:15.3420608
StringReplace 00:00:06.0687264
StringBuilderSingleChar 00:00:03.6252128
StringBuilderBlock 00:00:02.1831392
Run #2
RegexReplace 00:00:51.0333824
RegexReplaceWithTest 00:00:45.9260384
CompiledRegexReplaceWithTest 00:00:15.0316144
StringReplace 00:00:06.0386832
StringBuilderSingleChar 00:00:03.6652704
StringBuilderBlock 00:00:02.1330672
It looks like the StringBuilderBlock method is the best by a reasonably
significant margin. The code for that on its own would be:
/// <summary>
/// Collapses every run of two or more consecutive spaces in
/// <paramref name="x"/> into a single space.
/// </summary>
/// <param name="x">The text to flatten. Must not be null.</param>
/// <returns>
/// <paramref name="x"/> itself when it contains no run of two or more
/// spaces; otherwise a new string in which each such run is replaced by
/// exactly one space. Single spaces and all other characters are kept as-is.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="x"/> is null.
/// </exception>
public static string FlattenSpaces(string x)
{
    if (x == null)
    {
        throw new ArgumentNullException("x");
    }

    // Fast path: nothing to collapse, so return the original instance
    // without allocating a builder. The search literal is TWO spaces.
    if (x.IndexOf("  ", StringComparison.Ordinal) == -1)
    {
        return x;
    }

    StringBuilder builder = new StringBuilder(x.Length);
    int start = 0;
    while (true)
    {
        // Locate the next run of at least two spaces (literal is TWO spaces).
        int nextDoubleSpace = x.IndexOf("  ", start, StringComparison.Ordinal);
        if (nextDoubleSpace == -1)
        {
            break;
        }

        // Copy the whole block before the run, plus the run's first space.
        builder.Append(x, start, nextDoubleSpace + 1 - start);

        // Step over the second space of the run, then any further spaces.
        start = nextDoubleSpace + 2;
        while (start < x.Length && x[start] == ' ')
        {
            start++;
        }
    }

    // Copy whatever follows the last run (possibly nothing).
    builder.Append(x, start, x.Length - start);
    return builder.ToString();
}
Benchmark code (run with -runtwice on my box):
// See
http://www.pobox.com/~skeet/csharp/benchmark.html
// for how to run this code.
using System;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Benchmarks several ways of collapsing runs of spaces in OCR-B lines into
/// a single space. Each benchmark method processes every entry of
/// <see cref="TestCases"/> <see cref="iterations"/> times and stores the sum
/// of the resulting string lengths in <see cref="check"/>, which
/// <see cref="Check"/> then validates.
/// </summary>
public class MultiSpace
{
    // Sample lines to flatten. Once runs of spaces are collapsed, their
    // lengths are 58 + 58 + 56 + 42 + 11 + 54 = 279, which is the
    // per-iteration total that Check() expects.
    // NOTE(review): longer runs of spaces inside these literals appear to
    // have been collapsed in transit; only the join in the first entry still
    // yields a double space. Results are unaffected, but timings may not be
    // representative - restore the original spacing if it is available.
    static readonly string[] TestCases =
    {
        "0100 000187004>221 74101 02080003 95200208060+ "+
        " 010184 473>",
        "01 00000179008>0 00050175 7500100007054 24008+ "+
        "0103 97904>",
        "01 0000006630 3>104922 351100079647820000008+ 010194507>",
        "0 00000000000 000111033122108+ 077782103 >",
        "5 00002700>",
        "0100000241504> 113730619000003472360720026+ 010231043>",
    };

    // Sum of flattened lengths recorded by the most recent benchmark run.
    static long check;

    // Number of passes over TestCases per benchmark; overridable via Init.
    static int iterations = 100000;

    /// <summary>
    /// Reads the optional iteration count from the first argument.
    /// </summary>
    /// <param name="args">Arguments; when non-empty, args[0] is parsed as the iteration count.</param>
    public static void Init(string[] args)
    {
        if (args.Length != 0)
            iterations = Int32.Parse(args[0]);
    }

    /// <summary>
    /// Clears the recorded total.
    /// </summary>
    public static void Reset()
    {
        check = 0;
    }

    /// <summary>
    /// Verifies that the recorded total equals 279 per iteration.
    /// </summary>
    /// <exception cref="Exception">The recorded total does not match.</exception>
    public static void Check()
    {
        // Multiply as long so that large iteration counts cannot overflow int.
        if (check != 279L * iterations)
            throw new Exception("Invalid check total: " + check);
    }

    /// <summary>
    /// Flattens with the static Regex.Replace on every string.
    /// </summary>
    [Benchmark]
    public static void RegexReplace()
    {
        long total = 0;
        for (int i = iterations; i > 0; i--)
        {
            foreach (string s in TestCases)
            {
                string x = s;
                x = Regex.Replace(x, " +", " ");
                total += x.Length;
            }
        }
        check = total;
    }

    /// <summary>
    /// As <see cref="RegexReplace"/>, but only invokes the regex when the
    /// string actually contains two consecutive spaces.
    /// </summary>
    [Benchmark]
    public static void RegexReplaceWithTest()
    {
        long total = 0;
        for (int i = iterations; i > 0; i--)
        {
            foreach (string s in TestCases)
            {
                string x = s;
                // Search literal is TWO spaces.
                if (x.IndexOf("  ") != -1)
                    x = Regex.Replace(x, " +", " ");
                total += x.Length;
            }
        }
        check = total;
    }

    // Built once and reused by CompiledRegexReplaceWithTest.
    static readonly Regex compiledRegex = new Regex(" +",
        RegexOptions.Compiled);

    /// <summary>
    /// As <see cref="RegexReplaceWithTest"/>, but using a precompiled regex.
    /// </summary>
    [Benchmark]
    public static void CompiledRegexReplaceWithTest()
    {
        long total = 0;
        for (int i = iterations; i > 0; i--)
        {
            foreach (string s in TestCases)
            {
                string x = s;
                // Search literal is TWO spaces.
                if (x.IndexOf("  ") != -1)
                    x = compiledRegex.Replace(x, " ");
                total += x.Length;
            }
        }
        check = total;
    }

    /// <summary>
    /// Flattens by repeatedly replacing two spaces with one until no double
    /// space remains.
    /// </summary>
    [Benchmark]
    public static void StringReplace()
    {
        long total = 0;
        for (int i = iterations; i > 0; i--)
        {
            foreach (string s in TestCases)
            {
                string x = s;
                // Both the search and the Replace source are TWO spaces; with
                // a single space this loop would never terminate.
                while (x.IndexOf("  ") != -1)
                    x = x.Replace("  ", " ");
                total += x.Length;
            }
        }
        check = total;
    }

    /// <summary>
    /// Flattens by copying one character at a time into a StringBuilder,
    /// skipping any space that directly follows another space.
    /// </summary>
    [Benchmark]
    public static void StringBuilderSingleChar()
    {
        long total = 0;
        for (int i = iterations; i > 0; i--)
        {
            foreach (string s in TestCases)
            {
                // No double space (literal is TWO spaces): nothing to do.
                if (s.IndexOf("  ") == -1)
                {
                    total += s.Length;
                    continue;
                }
                StringBuilder builder = new StringBuilder(s.Length);
                bool inSpace = false;
                foreach (char c in s)
                {
                    if (c == ' ')
                    {
                        // Keep only the first space of each run.
                        if (!inSpace)
                            builder.Append(c);
                        inSpace = true;
                    }
                    else
                    {
                        builder.Append(c);
                        inSpace = false;
                    }
                }
                total += builder.ToString().Length;
            }
        }
        check = total;
    }

    /// <summary>
    /// Flattens by copying whole blocks between runs of spaces into a
    /// StringBuilder, locating each run with IndexOf.
    /// </summary>
    [Benchmark]
    public static void StringBuilderBlock()
    {
        long total = 0;
        for (int i = iterations; i > 0; i--)
        {
            foreach (string x in TestCases)
            {
                // No double space (literal is TWO spaces): nothing to do.
                if (x.IndexOf("  ") == -1)
                {
                    total += x.Length;
                    continue;
                }
                StringBuilder builder = new StringBuilder(x.Length);
                int start = 0;
                while (true)
                {
                    // Search literal is TWO spaces.
                    int nextDoubleSpace = x.IndexOf("  ", start);
                    if (nextDoubleSpace == -1)
                        break;
                    // Copy the block before the run plus the run's first space.
                    builder.Append(x, start, nextDoubleSpace + 1 - start);
                    // Step past the second space, then any further spaces.
                    start = nextDoubleSpace + 2;
                    while (start < x.Length && x[start] == ' ')
                        start++;
                }
                // Copy whatever follows the last run.
                builder.Append(x, start, x.Length - start);
                total += builder.ToString().Length;
            }
        }
        check = total;
    }
}
--
Jon Skeet - <sk***@pobox.com>
http://www.pobox.com/~skeet/
If replying to the group, please do not mail me too