"Andrew Dixon" <da******@NOREPLY.yahoo.co.uk> wrote in message news:<mc***************@news-binary.blueyonder.co.uk>...
Hi.
Sorry, I don't really understand what you mean. Could you show me an
example or rewrite my expression?
Here is a simple example. Hope this helps.
<code>
import java.nio.*;
import java.nio.channels.*;
import java.io.*;
import java.util.regex.*;
/**
 * Command-line tool that prints every occurrence of one HTML element found
 * in a file: the opening tag, its body and the closing tag.
 *
 * <pre>
 * java TagBodyExtractor filename            (extracts "script" elements)
 * java TagBodyExtractor tagtext filename    (extracts "tagtext" elements)
 * </pre>
 *
 * Matching is case-sensitive, so lower-case the tags in the input before
 * running this program. The input is assumed to be syntactically valid HTML.
 */
public class TagBodyExtractor {

    /**
     * Entry point.
     *
     * @param args either {@code filename} or {@code tagtext filename};
     *             with no arguments a usage message is printed to stderr
     *             and the JVM exits with status 1
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("USAGE:");
            System.err.println("java TagBodyExtractor filename");
            System.err.println("or,");
            System.err.println("java TagBodyExtractor tagtext filename");
            System.exit(1);
        }

        String tagId;
        String inFileName;
        if (args.length == 2) {
            tagId = args[0];
            inFileName = args[1];
        } else {
            tagId = "script"; // do to-lower on tags before using this prog
            inFileName = args[0];
        }

        boolean bodyOnly = false; // false: output both tags and their bodies

        try {
            String str = readFile(inFileName);
            Matcher mat = buildPattern(tagId).matcher(str);
            while (mat.find()) {
                if (bodyOnly) {
                    System.out.println(mat.group(2));
                } else {
                    System.out.println(mat.group(1) + mat.group(2) + mat.group(3));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole file into a String, decoding the bytes as US-ASCII.
     * The stream is closed even when mapping or decoding fails.
     *
     * @param inFileName path of the file to read
     * @return the file content
     * @throws IOException if the file cannot be opened, mapped or closed
     */
    private static String readFile(String inFileName) throws IOException {
        FileInputStream fis = new FileInputStream(inFileName);
        try {
            FileChannel fc = fis.getChannel();
            MappedByteBuffer mbf
                = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
            // Size the array from the buffer itself so both always agree.
            byte[] barray = new byte[mbf.remaining()];
            mbf.get(barray);
            return new String(barray, "US-ASCII");
            // or: return new String(barray); // use the default charset
        } finally {
            // Closing the stream also closes the channel obtained from it.
            fis.close();
        }
    }

    /**
     * Builds the pattern that matches one complete element.
     * Group 1 is the opening tag, group 2 the body, group 3 the closing tag.
     *
     * @param tagId tag name to look for; it is matched literally
     * @return the compiled pattern
     */
    private static Pattern buildPattern(String tagId) {
        // Quote the name so regex metacharacters in the argument are literal.
        String quotedTag = Pattern.quote(tagId);
        String closingTag = "</" + quotedTag + ">";

        // here we assume syntax-error-free html file!
        // The lookahead after the tag name stops e.g. "b" from matching
        // "<body>" or "a" from matching "<abbr>".
        String regex = "(<" + quotedTag + "(?=[\\s/>])[^>]*>)" //1st capturing group
            // Body: quoted strings are consumed whole, so a closing tag inside
            // a string literal does not end the element.
            // NOTE(review): alternation under a lazy quantifier may recurse
            // deeply on very large bodies - confirm with realistic input.
            + "((?:\"[^\"]*\"|\'[^\']*\'|[^\"\'])*?(?="
            + closingTag + "))" //2nd capturing group
            + "(" + closingTag + ")"; //3rd capturing group
        return Pattern.compile(regex, Pattern.DOTALL | Pattern.MULTILINE);
    }
}
</code>