473,718 Members | 2,191 Online
Bytes | Software Development & Data Engineering Community
+ Post

Home Posts Topics Members FAQ

Languid - Language Guesser

2 New Member
Hello,
I am trying to run Languid language guesser - see http://languid.cantbedone.org/

I am not a perl programmer, and I have no idea how to get this script running. I am running perl scripts on my computer but Im not sure how to run this one. Part of the main script is called Guess.pm and its code is shown below -

Can someone please explain how I run this script.
Thanks, Bob


package Language::Guess ;

use strict;
use warnings;
require 5.008;
use Data::Dumper qw/Dumper/;
use Unicode::UCD 'charinfo';
use utf8;
use Encode qw/is_utf8 _utf8_on/;
use Unicode::Normal ize; # don’t trust your input!
use File::Spec::Fun ctions;
use Carp;

our $VERSION = '0.03';
our $MIN_LENGTH = 20;

=head1 NAME

Language::Guess

=head1 ABSTRACT

A statistical language guesser

=head1 SYNOPSIS

use Language::Guess ;

my $guesser = Language::Guess->new( modeldir => '~/train' );

while (my $line = <> ) {
my $lang = $guesser->simple_guess($ line);
print "Language was $lang\n\n";
}

=cut

#sub init {

our $MAX = 300;

our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon indonesian
hawaiian welsh latin swahili basque/;
our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene Turkish Hungarian
Azeri Lithuanian Estonian/;
our @ACCENTED_LATIN = (qw/Albanian catalan Spanish French German Dutch Italian Danish
Icelandic Norwegian Swedish Finnish Latvian Portuguese
/, @EXOTIC_LATIN);

our @ALL_LATIN = ( @BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN );

our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek Mongolian
Serbian Macedonian Bulgarian Kyrgyz/;
our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/;
our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi Nepali
Sanskrit/;

our @SINGLETONS = qw/Armenian Hebrew Bengali Gurumkhi Greek Gujarati Oriya
Tamil Telugu Kannada Malayalam Sinhala Thai Lao Tibetan
Burmese Georgian Mongolian/;

#}

binmode STDOUT, 'utf8';
binmode STDERR, 'utf8';

sub new {
my ( $class, %params ) = @_;
croak "Must provide a model directory" unless exists $params{modeldi r};
croak "Model directory does not exist" unless -d $params{modeldi r};

my $self = bless { %params }, $class;
return $self;
}


sub guess {
my ( $self,$string ) = @_;
#warn $string;
_utf8_on($strin g);
$self->load_models( ) unless defined $self->{models};
my @runs = find_runs( $string );
#warn "Found ", scalar @runs, " runs\n";
#warn $runs[0][1];
my @langs;
my %scripts;
foreach my $run ( @runs ) {
$scripts{$run->[1]}++;
}

# returns arrayref of hashes in the form
# [ { name => NAME, score => SCORE }]

return $self->identify( $string, %scripts );

}

sub simple_guess {
my ( $self, $string ) = @_;
my $got = $self->guess($string) ;
#warn Dumper($got);
return $got->[0]{name};
}


sub load_models {
my ( $self ) = @_;

opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!";
my %models;
while ( my $f = readdir $dh ) {
next unless $f =~ /\.train$/;
my ( $name ) = $f =~ m|(.*)\.|;
my $path = catfile( $self->{modeldir}, $f );
open my $fh, "<:utf8", $path or die "Failed to open file: $!";
my %model;
while ( my $line = <$fh> ) {
chomp $line;
my ( $k, $v) = $line =~ m|(.{3})\s+(.*) |;
next unless defined $k;
#warn "'$k' $v\n";
$model{$k} = $v;
}
$models{$name} = \%model;
}
$self->{models} = \%models;
}

=item find_runs STRING

This is unused for the moment; the subroutine finds runs of scripts in a string
and returns an array of them. Upgrades basic latin pieces to accented and
exotic latin if characters from those script blocks are found. This avoids
languages like Polish from being split into a thousand runs of two and three
basic latin characters, interspersed with accented.

=cut

sub find_runs {
my ( $raw ) = @_;

my @chars = split m//, $raw;

my $prev = '';
my @c;
my @runs;
my @run_types;
my $current_run = -1;

foreach my $c ( @chars ) {
my $is_alph = $c =~ /[[:alpha:]]/o;
my $inf = get_charinfo( $c );
if ( $is_alph and !( $inf->{block} eq $prev) ) {
$prev = $inf->{block};
@c = ();
$current_run++;
$run_types[$current_run] = $prev;
}
push @c, $c;
push @{ $runs[$current_run] }, $c if $current_run > -1;
}

my ( $newruns, $newtypes ) = reconcile_latin ( \@runs, \@run_types );


my $counter =0;
my @result;
foreach my $r ( @$newruns ) {
push @result, [ $r, $newtypes->[$counter]];
$counter++;
}
return @result;
}

{ my %cache;
sub get_charinfo {
my ( $char ) = @_;
return $cache{$char} if exists $cache{$char};
my $inf = charinfo( ord( $char ));
$cache{$char} = $inf;
}
}


=item reconcile_latin STRING, ARREF

internal method, attempts to pick which level of weird diacriticalness
a latin string has. Consolidates runs into one string.

=cut

sub reconcile_latin {
my ( $runs, $types ) = @_;
my @types = @$types;
my (@new_runs, @new_types);
my $last_type = '';

my $upgrade;
$upgrade = 'Accented Latin' if has_supplementa l_latin( @$types );
$upgrade = 'Exotic Latin' if has_extended_la tin( @$types );
$upgrade = 'Superfreak Latin' if has_latin_exten ded_additional( @$types );

return ( $runs, $types ) unless $upgrade;
my $run_count = -1;
foreach my $r ( @$runs ) {
my $type = shift @types;
$type = $upgrade if $type =~ /Latin/;
$run_count++ unless $type eq $last_type;

push @{$new_runs[$run_count]}, @$r;
$new_types[$run_count] = $type;
$last_type = $type;
}
return ( \@new_runs, \@new_types );
}



sub has_extended_la tin {
my ( @types ) = @_;
return scalar grep { /Latin Extended-A/ } @types;
}

sub has_supplementa l_latin {
my ( @types ) = @_;
return scalar grep { /Latin-1 Supplement/ } @types;
}

sub has_latin_exten ded_additional {
my ( @types ) = @_;
return scalar grep { /Latin Extended Additional/ } @types;
}


sub identify {
my ( $self, $sample, %scripts ) = @_;
#warn "Incoming scripts are ", join ", ", keys %scripts;

return [{ name => 'too short', score => 1 }] if length($sample) < 3;
return [{ name => "Swedish Chef", score => 1}] if $sample =~ /bork bork bork/i;
return [{ name => "Pacman", score => 1}] if $sample =~ /waka waka waka/i;



# Check for Korean
if ( exists $scripts{'Hangu l Syllables'} or
exists $scripts{'Hangu l Jamo'} or
exists $scripts{'Hangu l Compatibility Jamo'} or
exists $scripts{'Hangu l'}) {
return [{ name =>'korean', score => 1 }];
}
if ( exists $scripts{'Greek and Coptic'} ){

return [{ name =>'greek', score => 1 }];
}

if ( exists $scripts{'Katak ana'} or
exists $scripts{'Hirag ana'} or
exists $scripts{'Katak ana Phonetic Extensions'}) {
return [{ name =>'japanese', score => 1 }];
}


if ( exists $scripts{'CJK Unified Ideographs'} or
exists $scripts{'Bopom ofo'} or
exists $scripts{'Bopom ofo Extended'} or
exists $scripts{'KangX i Radicals'} or
exists $scripts{'Arabi c Presentation Forms-A'} ) {
return [{ name => 'chinese', score => 1 }];
}

if ( exists $scripts{'Cyril lic'} ) {
return $self->check( $sample, @CYRILLIC );
}


if ( exists $scripts{'Arabi c'} or
exists $scripts{'Arabi c Presentation Forms-A'} or
exists $scripts{'Arabi c Presentation Forms-B'}
){
return $self->check( $sample, @ARABIC );
}

if ( exists $scripts{'Devan agari'} ) {
return $self->check( $sample, @DEVANAGARI );
}


# Try languages with unique scripts
foreach my $s ( @SINGLETONS ) {
return [{ name => lc($s), score => 1 }] if exists $scripts{$s};
}

if ( exists $scripts{'Super freak Latin'} ) {
return [{ name => 'vietnamese', score => 1 }];
}

if ( exists $scripts{'Exoti c Latin'} ) {
return $self->check( $sample, @EXOTIC_LATIN );
}

if ( exists $scripts{'Accen ted Latin'} ) {
return $self->check( $sample, @ACCENTED_LATIN );
}


if ( exists $scripts{'Basic Latin'} ) {
return $self->check( $sample, @ALL_LATIN );
}

return [{ name => "unknown script: '".(join ", ", keys %scripts)."'", score => 1}];

}


sub check {
my ( $self, $raw, @langs ) = @_;
#return join ' ', @langs
#warn "Checking sample $sample", "\n";
#my $num_tri = length( $sample ) / 3;

my $sample = __normalize($ra w);
return { name => 'too short', score => 1 } if length($sample) < $MIN_LENGTH;
my $mod = __make_model( $sample );
my $num_tri = scalar keys %{$mod};
my %scores;
foreach my $key ( @langs ) {
my $l = lc( $key );
#warn "Checking $l\n";
next unless exists $self->{models}{$l} ;
my $score = __distance( $mod, $self->{models}{$l} );
$scores{$l} = $score;
}
my @sorted = sort { $scores{$a} <=> $scores{$b} } keys %scores;
my @out;
$num_tri ||=1;
foreach my $s ( @sorted ) {
my $norm = $scores{$s}/$num_tri;
push @out, { name => $s , score => int($norm) };
}
return [splice ( @out, 0, 4 )];

if ( @sorted ) {
return splice ( @sorted, 0, 4 );
my @all;
my $firstscore = $scores{$sorted[0]};
while ( my $next = shift @sorted ) {
last unless $scores{$next} == $firstscore;
push @all, $next;
}
return join ',', @all;
}
return { name => 'unknown'. ( join ' ', @langs), score =>1 };
}


sub __distance {
my ( $m1, $m2 ) = @_;
my $dist =0;
foreach my $k ( keys %{$m1} ) {
$dist +=
( exists $m2->{$k} ?
abs( $m2->{$k} - $m1->{$k} ) :
$MAX
);
}
return $dist;
}
Sep 13 '06 #1
0 2009

Sign in to post your reply or Sign up for a free account.

Similar topics

0
1598
by: Thiva Charanasri | last post by:
http://www.poweroflanguage.org Track: Computer Language 1st World Congress on the Power of Language: Theory, Practice and Performance Date: March 6 - 10, 2006 Bangkok, Thailand On this very auspicious occasion, Thai people will join hands with
22
2864
by: Michael Nahas | last post by:
Antti & all interested, The draft description of my language to replace C is available at: http://nahas.is-a-geek.com/~mike/MyC.pdf I am a long time C programmer (I read the old testament in 1987) and I've tried to keep the spirit of C and make as few changes as possible. I was mostly driven by the bloat of C++ and, now, C99. I was also
134
8021
by: evolnet.regular | last post by:
I've been utilising C for lots of small and a few medium-sized personal projects over the course of the past decade, and I've realised lately just how little progress it's made since then. I've increasingly been using scripting languages (especially Python and Bourne shell) which offer the same speed and yet are far more simple and safe to use. I can no longer understand why anyone would willingly use C to program anything but the lowest...
0
1601
by: Thiva Charanasri | last post by:
http://www.poweroflanguage.org Track: Computer Language 1st World Congress on the Power of Language: Theory, Practice and Performance Date: March 6 - 10, 2006 Bangkok, Thailand On this very auspicious occasion, Thai people will join hands with
2
1652
by: rjack | last post by:
I'm using VS 2005 Beta 2. In VS 2003, the Page directive in an aspx page has Language and CodeBehind attributes. You can have the language be different than the code behind file language. For instance, you can have Language="vb" CodeBehind="classA.cs" In VS 2005, the CodeBehind attribute seems to have been replaced with
35
2379
by: John Coleman | last post by:
Greetings, I have a rough classification of languages into 2 classes: Zen languages and tool languages. A tool language is a language that is, well, a *tool* for programming a computer. C is the prototypical tool language. Most languages in the Algol family are tool languages. Visual Basic and Java are also tool languages. On the other hand, a Zen language is a language which is purported to transform your way of thinking about...
23
3640
by: Xah Lee | last post by:
The Concepts and Confusions of Pre-fix, In-fix, Post-fix and Fully Functional Notations Xah Lee, 2006-03-15 Let me summarize: The LISP notation, is a functional notation, and is not a so-called pre-fix notation or algebraic notation. Algebraic notations have the concept of operators, meaning, symbols placed around arguments. In algebraic in-fix notation, different
669
26019
by: Xah Lee | last post by:
in March, i posted a essay “What is Expressiveness in a Computer Language”, archived at: http://xahlee.org/perl-python/what_is_expresiveness.html I was informed then that there is a academic paper written on this subject. On the Expressive Power of Programming Languages, by Matthias Felleisen, 1990. http://www.ccs.neu.edu/home/cobbe/pl-seminar-jr/notes/2003-sep-26/expressive-slides.pdf
10
10788
by: Immortalist | last post by:
Various aquisition devices that guide learning along particular pathways towards human biases. And as E.O. Wilson might say mental development appears to be genetically constrained. (1) Language Aquisition Device (2) Color Aqusition Device (3) Sound Aquistion Device (4) Smell Aquisition Device (5) Touch Aquisition Device (6) Art Aquisition Device
0
8724
by: Hystou | last post by:
Most computers default to English, but sometimes we require a different language, especially when relocating. Forgot to request a specific language before your computer shipped? No problem! You can effortlessly switch the default language on Windows 10 without reinstalling. I'll walk you through it. First, let's disable language synchronization. With a Microsoft account, language settings sync across devices. To prevent any complications,...
0
9207
jinu1996
by: jinu1996 | last post by:
In today's digital age, having a compelling online presence is paramount for businesses aiming to thrive in a competitive landscape. At the heart of this digital strategy lies an intricately woven tapestry of website design and digital marketing. It's not merely about having a website; it's about crafting an immersive digital experience that captivates audiences and drives business growth. The Art of Business Website Design Your website is...
1
9121
by: Hystou | last post by:
Overview: Windows 11 and 10 have less user interface control over operating system update behaviour than previous versions of Windows. In Windows 11 and 10, there is no way to turn off the Windows Update option using the Control Panel or Settings app; it automatically checks for updates and installs any it finds, whether you like it or not. For most users, this new feature is actually very convenient. If you want to control the update process,...
0
9053
tracyyun
by: tracyyun | last post by:
Dear forum friends, With the development of smart home technology, a variety of wireless communication protocols have appeared on the market, such as Zigbee, Z-Wave, Wi-Fi, Bluetooth, etc. Each protocol has its own unique characteristics and advantages, but as a user who is planning to build a smart home system, I am a bit confused by the choice of these technologies. I'm particularly interested in Zigbee because I've heard it does some...
0
5971
by: conductexam | last post by:
I have .net C# application in which I am extracting data from word file and save it in database particularly. To store word all data as it is I am converting the whole word file firstly in HTML and then checking html paragraph one by one. At the time of converting from word file to html my equations which are in the word document file was convert into image. Globals.ThisAddIn.Application.ActiveDocument.Select();...
0
4481
by: TSSRALBI | last post by:
Hello I'm a network technician in training and I need your help. I am currently learning how to create and manage the different types of VPNs and I have a question about LAN-to-LAN VPNs. The last exercise I practiced was to create a LAN-to-LAN VPN between two Pfsense firewalls, by using IPSEC protocols. I succeeded, with both firewalls in the same network. But I'm wondering if it's possible to do the same thing, with 2 Pfsense firewalls...
1
3182
by: 6302768590 | last post by:
Hai team i want code for transfer the data from one system to another through IP address by using C# our system has to for every 5mins then we have to update the data what the data is updated we have to send another system
2
2555
muto222
by: muto222 | last post by:
How can i add a mobile payment intergratation into php mysql website.
3
2122
bsmnconsultancy
by: bsmnconsultancy | last post by:
In today's digital era, a well-designed website is crucial for businesses looking to succeed. Whether you're a small business owner or a large corporation in Toronto, having a strong online presence can significantly impact your brand's success. BSMN Consultancy, a leader in Website Development in Toronto offers valuable insights into creating effective websites that not only look great but also perform exceptionally well. In this comprehensive...

By using Bytes.com and it's services, you agree to our Privacy Policy and Terms of Use.

To disable or enable advertisements and analytics tracking please visit the manage ads & tracking page.