469,323 Members | 1,427 Online
Bytes | Developer Community
New Post

Home Posts Topics Members FAQ

Post your question to a community of 469,323 developers. It's quick & easy.

Languid - Language Guesser

2
Hello,
I am trying to run Languid language guesser - see http://languid.cantbedone.org/

I am not a perl programmer, and I have no idea how to get this script running. I am running perl scripts on my computer but Im not sure how to run this one. Part of the main script is called Guess.pm and its code is shown below -

Can someone please explain how I run this script.
Thanks, Bob


package Language::Guess;

use strict;
use warnings;
require 5.008;
use Data::Dumper qw/Dumper/;
use Unicode::UCD 'charinfo';
use utf8;
use Encode qw/is_utf8 _utf8_on/;
use Unicode::Normalize; # dont trust your input!
use File::Spec::Functions;
use Carp;

our $VERSION = '0.03';
our $MIN_LENGTH = 20;

=head1 NAME

Language::Guess

=head1 ABSTRACT

A statistical language guesser

=head1 SYNOPSIS

use Language::Guess;

my $guesser = Language::Guess->new( modeldir => '~/train' );

while (my $line = <> ) {
my $lang = $guesser->simple_guess($line);
print "Language was $lang\n\n";
}

=cut

#sub init {

our $MAX = 300;

our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon indonesian
hawaiian welsh latin swahili basque/;
our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene Turkish Hungarian
Azeri Lithuanian Estonian/;
our @ACCENTED_LATIN = (qw/Albanian catalan Spanish French German Dutch Italian Danish
Icelandic Norwegian Swedish Finnish Latvian Portuguese
/, @EXOTIC_LATIN);

our @ALL_LATIN = ( @BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN );

our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek Mongolian
Serbian Macedonian Bulgarian Kyrgyz/;
our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/;
our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi Nepali
Sanskrit/;

our @SINGLETONS = qw/Armenian Hebrew Bengali Gurumkhi Greek Gujarati Oriya
Tamil Telugu Kannada Malayalam Sinhala Thai Lao Tibetan
Burmese Georgian Mongolian/;

#}

binmode STDOUT, 'utf8';
binmode STDERR, 'utf8';

sub new {
my ( $class, %params ) = @_;
croak "Must provide a model directory" unless exists $params{modeldir};
croak "Model directory does not exist" unless -d $params{modeldir};

my $self = bless { %params }, $class;
return $self;
}


sub guess {
my ( $self,$string ) = @_;
#warn $string;
_utf8_on($string);
$self->load_models() unless defined $self->{models};
my @runs = find_runs( $string );
#warn "Found ", scalar @runs, " runs\n";
#warn $runs[0][1];
my @langs;
my %scripts;
foreach my $run ( @runs ) {
$scripts{$run->[1]}++;
}

# returns arrayref of hashes in the form
# [ { name => NAME, score => SCORE }]

return $self->identify( $string, %scripts );

}

sub simple_guess {
my ( $self, $string ) = @_;
my $got = $self->guess($string);
#warn Dumper($got);
return $got->[0]{name};
}


sub load_models {
my ( $self ) = @_;

opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!";
my %models;
while ( my $f = readdir $dh ) {
next unless $f =~ /\.train$/;
my ( $name ) = $f =~ m|(.*)\.|;
my $path = catfile( $self->{modeldir}, $f );
open my $fh, "<:utf8", $path or die "Failed to open file: $!";
my %model;
while ( my $line = <$fh> ) {
chomp $line;
my ( $k, $v) = $line =~ m|(.{3})\s+(.*)|;
next unless defined $k;
#warn "'$k' $v\n";
$model{$k} = $v;
}
$models{$name} = \%model;
}
$self->{models} = \%models;
}

=item find_runs STRING

This is unused for the moment; the subroutine finds runs of scripts in a string
and returns an array of them. Upgrades basic latin pieces to accented and
exotic latin if characters from those script blocks are found. This avoids
languages like Polish from being split into a thousand runs of two and three
basic latin characters, interspersed with accented.

=cut

sub find_runs {
my ( $raw ) = @_;

my @chars = split m//, $raw;

my $prev = '';
my @c;
my @runs;
my @run_types;
my $current_run = -1;

foreach my $c ( @chars ) {
my $is_alph = $c =~ /[[:alpha:]]/o;
my $inf = get_charinfo( $c );
if ( $is_alph and !( $inf->{block} eq $prev) ) {
$prev = $inf->{block};
@c = ();
$current_run++;
$run_types[$current_run] = $prev;
}
push @c, $c;
push @{ $runs[$current_run] }, $c if $current_run > -1;
}

my ( $newruns, $newtypes ) = reconcile_latin( \@runs, \@run_types );


my $counter =0;
my @result;
foreach my $r ( @$newruns ) {
push @result, [ $r, $newtypes->[$counter]];
$counter++;
}
return @result;
}

{ my %cache;
sub get_charinfo {
my ( $char ) = @_;
return $cache{$char} if exists $cache{$char};
my $inf = charinfo( ord( $char ));
$cache{$char} = $inf;
}
}


=item reconcile_latin STRING, ARREF

internal method, attempts to pick which level of weird diacriticalness
a latin string has. Consolidates runs into one string.

=cut

sub reconcile_latin {
my ( $runs, $types ) = @_;
my @types = @$types;
my (@new_runs, @new_types);
my $last_type = '';

my $upgrade;
$upgrade = 'Accented Latin' if has_supplemental_latin( @$types );
$upgrade = 'Exotic Latin' if has_extended_latin( @$types );
$upgrade = 'Superfreak Latin' if has_latin_extended_additional( @$types );

return ( $runs, $types ) unless $upgrade;
my $run_count = -1;
foreach my $r ( @$runs ) {
my $type = shift @types;
$type = $upgrade if $type =~ /Latin/;
$run_count++ unless $type eq $last_type;

push @{$new_runs[$run_count]}, @$r;
$new_types[$run_count] = $type;
$last_type = $type;
}
return ( \@new_runs, \@new_types );
}



sub has_extended_latin {
my ( @types ) = @_;
return scalar grep { /Latin Extended-A/ } @types;
}

sub has_supplemental_latin {
my ( @types ) = @_;
return scalar grep { /Latin-1 Supplement/ } @types;
}

sub has_latin_extended_additional {
my ( @types ) = @_;
return scalar grep { /Latin Extended Additional/ } @types;
}


sub identify {
my ( $self, $sample, %scripts ) = @_;
#warn "Incoming scripts are ", join ", ", keys %scripts;

return [{ name => 'too short', score => 1 }] if length($sample) < 3;
return [{ name => "Swedish Chef", score => 1}] if $sample =~ /bork bork bork/i;
return [{ name => "Pacman", score => 1}] if $sample =~ /waka waka waka/i;



# Check for Korean
if ( exists $scripts{'Hangul Syllables'} or
exists $scripts{'Hangul Jamo'} or
exists $scripts{'Hangul Compatibility Jamo'} or
exists $scripts{'Hangul'}) {
return [{ name =>'korean', score => 1 }];
}
if ( exists $scripts{'Greek and Coptic'} ){

return [{ name =>'greek', score => 1 }];
}

if ( exists $scripts{'Katakana'} or
exists $scripts{'Hiragana'} or
exists $scripts{'Katakana Phonetic Extensions'}) {
return [{ name =>'japanese', score => 1 }];
}


if ( exists $scripts{'CJK Unified Ideographs'} or
exists $scripts{'Bopomofo'} or
exists $scripts{'Bopomofo Extended'} or
exists $scripts{'KangXi Radicals'} or
exists $scripts{'Arabic Presentation Forms-A'} ) {
return [{ name => 'chinese', score => 1 }];
}

if ( exists $scripts{'Cyrillic'} ) {
return $self->check( $sample, @CYRILLIC );
}


if ( exists $scripts{'Arabic'} or
exists $scripts{'Arabic Presentation Forms-A'} or
exists $scripts{'Arabic Presentation Forms-B'}
){
return $self->check( $sample, @ARABIC );
}

if ( exists $scripts{'Devanagari'} ) {
return $self->check( $sample, @DEVANAGARI );
}


# Try languages with unique scripts
foreach my $s ( @SINGLETONS ) {
return [{ name => lc($s), score => 1 }] if exists $scripts{$s};
}

if ( exists $scripts{'Superfreak Latin'} ) {
return [{ name => 'vietnamese', score => 1 }];
}

if ( exists $scripts{'Exotic Latin'} ) {
return $self->check( $sample, @EXOTIC_LATIN );
}

if ( exists $scripts{'Accented Latin'} ) {
return $self->check( $sample, @ACCENTED_LATIN );
}


if ( exists $scripts{'Basic Latin'} ) {
return $self->check( $sample, @ALL_LATIN );
}

return [{ name => "unknown script: '".(join ", ", keys %scripts)."'", score => 1}];

}


sub check {
my ( $self, $raw, @langs ) = @_;
#return join ' ', @langs
#warn "Checking sample $sample", "\n";
#my $num_tri = length( $sample ) / 3;

my $sample = __normalize($raw);
return { name => 'too short', score => 1 } if length($sample) < $MIN_LENGTH;
my $mod = __make_model( $sample );
my $num_tri = scalar keys %{$mod};
my %scores;
foreach my $key ( @langs ) {
my $l = lc( $key );
#warn "Checking $l\n";
next unless exists $self->{models}{$l};
my $score = __distance( $mod, $self->{models}{$l} );
$scores{$l} = $score;
}
my @sorted = sort { $scores{$a} <=> $scores{$b} } keys %scores;
my @out;
$num_tri ||=1;
foreach my $s ( @sorted ) {
my $norm = $scores{$s}/$num_tri;
push @out, { name => $s , score => int($norm) };
}
return [splice ( @out, 0, 4 )];

if ( @sorted ) {
return splice ( @sorted, 0, 4 );
my @all;
my $firstscore = $scores{$sorted[0]};
while ( my $next = shift @sorted ) {
last unless $scores{$next} == $firstscore;
push @all, $next;
}
return join ',', @all;
}
return { name => 'unknown'. ( join ' ', @langs), score =>1 };
}


sub __distance {
my ( $m1, $m2 ) = @_;
my $dist =0;
foreach my $k ( keys %{$m1} ) {
$dist +=
( exists $m2->{$k} ?
abs( $m2->{$k} - $m1->{$k} ) :
$MAX
);
}
return $dist;
}
Sep 13 '06 #1
0 1859

Post your reply

Sign in to post your reply or Sign up for a free account.

Similar topics

reply views Thread by Thiva Charanasri | last post: by
22 posts views Thread by Michael Nahas | last post: by
134 posts views Thread by evolnet.regular | last post: by
reply views Thread by Thiva Charanasri | last post: by
2 posts views Thread by rjack | last post: by
35 posts views Thread by John Coleman | last post: by
669 posts views Thread by Xah Lee | last post: by
1 post views Thread by CARIGAR | last post: by
reply views Thread by zhoujie | last post: by
reply views Thread by suresh191 | last post: by
reply views Thread by Gurmeet2796 | last post: by
reply views Thread by mdpf | last post: by
reply views Thread by harlem98 | last post: by
By using this site, you agree to our Privacy Policy and Terms of Use.