#!/usr/bin/perl -w

# preprocesseur pour la base de donnees Filemaker
# transforme une enquete PFC en fichier CSV pour le schwa et la liaison
# version 0.4.1
# auteur: Julien Eychenne


use strict;
use File::Basename;
use Path::Class;
use File::Path;

use constant FALSE => 0;
use constant TRUE  => 1;

use constant XMIN => 0;
use constant XMAX => 1;
use constant TEXT => 2;
# TextGrid
use constant SCHWA   => 2;
use constant LIAISON => 3;
# categories
use constant PHON   => 0;
use constant LIAIS  => 1;
use constant CAT    => 2;


# enquête
my ($surveyPath, $surveyRF, $file, $filePath);
# infos fichiers et locuteurs
my ($id, $survey, $speaker, $task, $xmin, $xmax, $text, $phon);

my ($cat, $catAv, $catAp, $motPrec, $motSuiv, $k);

# codages schwa
my ($coding, $pos1, $pos2, $pos3, $pos4, $egraph);

# contexte phonetique
my ($schwa, $cgd, $cgi, $cdd, $cdi, $frontAv, $frontAp);

my ($bdlex, $tierRF, $int, $wd, $before, $after, $type, $token, $cnt, $idCnt);

my (@csvSchwa, @csvLiais, @interval);

my %expandedTask = ('l' => 'LIBRE',
					'g' => 'GUIDE',
					't' => 'TEXTE');

## MOTIFS LIAISON

# les delimiteurs pour les mots sont l'espace, la virgule, le point, 
# le point d'interrogation, le chevron fermant, l'apostrophe, ou le tiret.
my $del = " |,|\\.|\\?|>|\'|\\|\\-/";

# consonnes de liaison en SAMPA
my $K = "bcdfgjklmnpqrstvwxzBDFGJKLMNPQRSTVWXZ";

# pause/hÈsitation ou liaison prÈconsonantique
my $H = "hC";

# liaison en [n]
my $N = "nVO|nVN";

# motif de la liaison (cf. bulletin PFC 1 p63)
my $liaison = "[12][0-4](?:[h$K]|$N)[$H]?";

# code liaison
my ($n1, $n2, $cons, $hes, $nas, $precons);

####################################################################
# Main

my $msg =<<EOF;

************************************
** preprocesseur schwa et liaison **
************************************

chargement du lexique en memoire... 
EOF


$surveyPath = $ARGV[0] or do 
{
	print "Chemin du dossier a traiter :\n";
	$surveyPath = <>;
	chomp $surveyPath;
};

print $msg;

$survey = basename($surveyPath);
$bdlex = &loadBdlex;
$surveyRF = &processSurvey($surveyPath);

foreach $filePath (@{$surveyRF})
{	
	$file = basename($filePath);
	
	print "fichier $file...\n";
	
	$file =~ /^(...)(...)(.)/;
	$speaker = $2; $task = $3;
	
	$tierRF = &textgrid2array($filePath, SCHWA);
	
	# traite le textgrid pour le schwa
	for ($int = 0; $int <= $#{$tierRF}; $int++)
	{				
		$idCnt = 1; # le compteur est réinitialisé pour chaque intervalle

		$xmin = $tierRF->[$int][XMIN];
		$xmax = $tierRF->[$int][XMAX];
		$text = $tierRF->[$int][TEXT];
		
		$text = &praat2mac($text);
		
		# découpe l'intervalle en mots. Les commentaires "(...)" sont 
		# considérés comme un délimiteur.
		@interval = split(/(\s?\(.+?\)\s?|\-|\.|\s|\s?\?|,|<|>|\')/, $text);
		
		for ($wd = 0; $wd <= $#interval; $wd++)
		{
			$type = $token = $interval[$wd]; 
			$type =~ s/\d//g;
			
			# modifie la casse si le mot est dans le lexique
			if    (exists $bdlex->{"\l$type"}) {$type = "\l$type"}
			elsif (exists $bdlex->{"\L$type"}) {$type = "\L$type"}
			
			if ( exists $bdlex->{$type} )
			{
				$phon = $bdlex->{$type}[PHON];
				$cat  = $bdlex->{$type}[CAT]; 
			}
			
 			else 
 			{
				$phon = "???"; $cat  = "???";
			}
			
			# pour chaque codage schwa
			while ( $token =~ /(.?)([0-2])([1-5])([1-5])([1-4])/g )
			{
				$egraph = $1;
				$pos1 = $2;
				$pos2 = $3;
				$pos3 = $4;
				$pos4 = $5;
				
				if ($egraph ne "e") 
				{ 
					# groupes <CCe> finals simplifies
					if ( $pos3 == 5 and $token =~ /e(s|nt|)$/ )
					{
						$egraph = "e";
					}
					
					else { $egraph = "x" }
				}
				
				# récupère avant et après le mot dans l'intervalle
				if ($wd == 0) { $before = $motPrec = "BEGIN" }				
				else
				{
					for ($cnt = 0; $cnt < $wd; $cnt++)
					{ $before .= $interval[$cnt] }
					
					$k = 1;
					while ($interval[$wd-$k] =~ /\'|\s|\)|>/) { $k++ }
					( $motPrec = $interval[$wd-$k] ) =~ s/\d//g;
				}		
				
				if ($wd == $#interval) { $after = $motSuiv = "END" }				
				else
				{
					for ($cnt = $wd + 1; $cnt <= $#interval; $cnt++)
					{ $after .= $interval[$cnt] }

					$k = 1 ;
					while ($interval[$wd+$k] =~ /\{|\'|\s|\(|</ 
						and $wd+$k < $#interval) { $k++ }
					( $motSuiv = $interval[$wd+$k] ) =~ s/\d//g;
				}
				
				# récupère les catégories
				if (exists $bdlex->{$motPrec}) { $catAv = $bdlex->{$motPrec}[CAT] }
				else { $catAv = "???" }			

				if (exists $bdlex->{$motSuiv}) { $catAp = $bdlex->{$motSuiv}[CAT] }
				else { $catAp = "???" }	
				
				# ID
				$id = $survey.$speaker.$task.'s-'
				. sprintf("%03d", $int +1) . '-' . sprintf("%02d", $idCnt);
				$idCnt++;


				# contexte phonetique
				($cgd,$cgi,$cdd,$cdi,$frontAv,$frontAp) = findContext($type, $motPrec, $motSuiv, $pos2, $pos4, $pos3) ;

				if ($pos1 == 0) { $schwa = '.' }
				elsif ($pos1 == 1) { $schwa = '@' }
				elsif ($pos1 == 2) { $schwa = '?' }
				
				push @csvSchwa, "$id\t$survey\t$survey$speaker\t$expandedTask{$task}"
					."\t$type\t$egraph\t$pos1\t$pos2\t$pos3\t$pos4"
					."\t$before\t$token\t$after\t$phon\t$catAv\t$cat\t$catAp"
					."\t$cgi\t$frontAv\t$cgd\t$schwa\t$frontAp\t$cdd\t$cdi";
					
				$before = $after = '';
			}
		} # scan token for schwa and liaison
	}
	
	##################
	# codage liaison #
	##################
	$tierRF = &textgrid2array($filePath, LIAISON);
	
	for ($int = 0; $int <= $#{$tierRF}; $int++)
	{				
		$idCnt = 1; # le compteur est réinitialisé pour chaque intervalle

		$xmin = $tierRF->[$int][XMIN];
		$xmax = $tierRF->[$int][XMAX];
		$text = $tierRF->[$int][TEXT];
		
		$text = &praat2mac($text);
		
		# découpe l'intervalle en mots. Les commentaires "(...)" sont 
		# considérés comme un délimiteur.
		@interval = split(/(\s?\(.+?\)\s?|\-|\.|\s|\s?\?|,|<|>|\')/, $text);
		
		for ($wd = 0; $wd <= $#interval; $wd++)
		{
			$token = $interval[$wd]; 

			if ($token =~ /(.+?[^0-9])($liaison)$/)
			{
				$type = $1;
				my $code = $2;
				
				$hes = $nas = $precons = $cons = "";

				$code =~ /([12])([0-4])/;
				$n1 = $1; $n2 = $2;
				if ($code =~ /h/) { $hes = "h";}
				if ($code =~ /C/) { $precons = "C";}
				if ($code =~ /n(VO|VN|)/) { $nas = $1; $cons = "n"; }
				if ($code =~ /($K)/) { $cons = "h";}
				
			# modifie la casse si le mot est dans le lexique
			if    (exists $bdlex->{"\l$type"}) {$type = "\l$type"}
			elsif (exists $bdlex->{"\L$type"}) {$type = "\L$type"}
			
			if ( exists $bdlex->{$type} )
			{
				$phon = $bdlex->{$type}[PHON];
				$cat  = $bdlex->{$type}[CAT]; 
			}
			
 			else 
 			{
				$phon = "???"; $cat  = "???";
			}				
				# récupère avant et après le mot dans l'intervalle
				if ($wd == 0) { $before = $motPrec = "BEGIN" }				
				else
				{
					for ($cnt = 0; $cnt < $wd; $cnt++)
					{ $before .= $interval[$cnt] }
					
					$k = 1;
					while ($interval[$wd-$k] =~ /\'|\s|\)|>/) { $k++ }
					( $motPrec = $interval[$wd-$k] ) =~ s/\d//g;
				}		
				
				if ($wd == $#interval) { $after = $motSuiv = "END" }				
				else
				{
					for ($cnt = $wd + 1; $cnt <= $#interval; $cnt++)
					{ $after .= $interval[$cnt] }

					$k = 1 ;
					while ($interval[$wd+$k] =~ /\{|\'|\s|\(|</ 
						and $wd+$k < $#interval) { $k++ }
					( $motSuiv = $interval[$wd+$k] ) =~ s/\d//g;
				}
				
				# récupère les catégories
				if (exists $bdlex->{$motPrec}) { $catAv = $bdlex->{$motPrec}[CAT] }
				else { $catAv = "???" }			

				if (exists $bdlex->{$motSuiv}) { $catAp = $bdlex->{$motSuiv}[CAT] }
				else { $catAp = "???" }	
				
				# ID
				$id = $survey.$speaker.$task.'l-'
				. sprintf("%03d", $int +1) . '-' . sprintf("%02d", $idCnt);
				$idCnt++;

				push @csvLiais, "$id\t$survey\t$survey$speaker\t$expandedTask{$task}"
					."\t$type\t$before\t$token\t$after\t$phon\t$cat"
					."\t$n1\t$n2\t$cons\t$hes\t$nas\t$precons";
	
				$before = $after = '';
			}
		} # scan token for schwa and liaison
	}	
}

open (OUT, ">schwa-$survey.csv") or die "impossible de creer le fichier: $!\n";

# header pour kexi
print OUT "Occurrence\tEnquete\tLocuteur\tTache"
					."\tType\tGraphie\tPos1\tPos2\tPos3\tPos4"
					."\tContGauche\tToken\tContDroit\tPhonet\tCatPrec\tCat\tCatSuiv"
					."\tCGI\tFrontAv\tCGD\tSchwa\tFrontAp\tCDD\tCDI\n";

for (@csvSchwa) {print OUT "$_\n" }
print "\n$#csvSchwa codages schwa\n";
close OUT;


open (OUT, ">liaison-$survey.csv") or die "impossible de creer le fichier: $!\n";

for (@csvLiais) { print OUT "$_\n" }
print "$#csvLiais codages liaison\n";
close OUT;


# End main
####################################################################


# retourne tous les fichiers TextGrid du dossier
sub processSurvey
{
	my $root = shift;
	my ($speaker, $speakerPath, $file, $filePath);
	my @survey;
	
	opendir (ROOT, $root) 
		or die "Problem opening $root: $!\n";
	
	while ( defined ($speaker = readdir ROOT) )
	{
		$speakerPath = dir($root, $speaker);
		next if ( not -d $speakerPath or $speaker =~ /^\./ );
		
		opendir (SPEAKER, $speakerPath)
			or die "Problem opening $speakerPath: $!\n";
			
		while ( defined ($file = readdir SPEAKER) )
		{
			$filePath = file($speakerPath, $file);
			next if ( $file =~ /^\./ or $file !~ /g\.TextGrid$/i 
				or $file =~ /mg\.TextGrid$/i );
			
			push @survey, $filePath;
		}
		
		closedir SPEAKER;
	}
	
	closedir ROOT;
	
	return \@survey;
}

# charge lexique
sub loadBdlex
{
    my $bdlexicon = "bdlex-utf8.csv";
	my ($ortho, $phon, $liais, $cat, $cpt);
    my %lex;
	
    open(LEX, $bdlexicon)
		or die "Impossible d'ouvrir le fichier $bdlexicon : $!\n";

    while(<LEX>) 
    {
		/^(.*?);(.*?);(.*?);(.*?);/;
		$ortho = $1; $phon = $2; $liais = $3; $cat = $4;
		
#		if ($liais ne "") { $liais = "($liais)" }	
		$lex{$ortho} = [$phon, $liais, $cat];
		$cpt++;
    }

    close(LEX);
	
	# ajouts
	$lex{"parce"} = ['parc@', '', 'CON'];
	
	print "$cpt entrees on ete chargees.\n\n";
	
    return \%lex;
}

# transforme le fichier TextGrid en tableau
sub textgrid2array
{
    my $infile = shift;
    my $tier = shift;
	my $HEADER = TRUE;
	my $THE_TIER  = FALSE;
	my ($interval, $xmin, $xmax, $text);
	my @tier;
	my $next = $tier + 1;
	
    open (TGD, $infile) 
	or die "Probleme a l'ouverture du fichier $infile : $!\n";

    while (<TGD>)
    {
		# Get a tier
		if ( /item \[$tier\]:/ ) { $HEADER = TRUE; $THE_TIER = TRUE }

		elsif ( /item \[$next\]:/ ) { $THE_TIER = FALSE }
		
		elsif ( /intervals \[(\d+)\]:/ and $HEADER = TRUE )
		{
			$HEADER = FALSE;
		}
		
		else { next if $THE_TIER == FALSE or $HEADER == TRUE }

		if ( /xmin = ((\d|\.)+)/ ) { $xmin = $1 }
		
		elsif ( /xmax = ((\d|\.)+)/ ) { $xmax = $1 }
		
		elsif ( /text = \"(.*)\"/ )
		{
			$text = $1;
			push @tier, [$xmin, $xmax, $text];
			$xmin = $xmax = $text = "";
		}
    }
	
	close TGD;
	
	return \@tier;
}


# convert Generic to mac
sub praat2mac
{
	my $ln = shift;
	
    $ln =~ s/\\c,/ç/g;

    $ln =~ s/\\a\^/â/g;
    $ln =~ s/\\e\^/ê/g;
    $ln =~ s/\\i\^/î/g;
    $ln =~ s/\\o\^/ô/g;
    $ln =~ s/\\u\^/û/g;
    
    $ln =~ s/\\a\"\"/ä/g;
    $ln =~ s/\\e\"\"/ë/g;
    $ln =~ s/\\i\"\"/ï/g;
    $ln =~ s/\\o\"\"/ö/g;
    $ln =~ s/\\a\"\"/ü/g;
    
    $ln =~ s/\\e\'/é/g;
 
    $ln =~ s/\\a\`/à/g; 
    $ln =~ s/\\e\`/è/g;
    $ln =~ s/\\u\`/ù/g;

    return $ln;
}


################################################################
## récup ancien code pour le contexte ## schwa uniquement
sub findContext
{

# Voyelles graphiques :                                                     
my $V = "[aeiouyâêîôûéèàäëïöü]|ou|ei|eu|au|eau|ai|eai";                    
# E graphique précodé :                                                     
my $E = "e\\?[0-4\\?]{3}";                                                 
# Consonnes graphiques :                                                    
my $C = "[bcdfgjklmnpqrstvwyz]|ch|sh|sch|qu|ph|gu|ll";                     
# Consonnes doubles graphiques (précédant un schwa) :                       
my $D = "bb|dd|ff|ll|mm|nn|pp|rr|ss|tt";                                   
# Liquides (graphiques) :                                                   
my $L = "[rl]";                                                            
# Obtruante+liquide (graphiques)                                            
my $OL = "(s|)[pbtdcg][rl]";                                               
# Appendice nasal (graphique) :                                             
my $N = "[mn]";                                                            
# Caractères français :                                                     
my $spec = "[âêîôûéèàäëïöüç]";                                             
# Voyelles phonétiques :                                                    
my $VP = "[aAiyuOoeE\@269~]";                                          
# Consonnes phonétiques :                                                   
my $CP = "[jHwpbtdkgfvszSZmnJNlrRx]";   
# Hash associant les consonnes graphiques aux consonnes phonétiques         
my %cons_fin = ( "j" => "il|ï" ,                                            
		 "w" => "" ,                                                
		 "p" => "p" ,                                               
		 "b" => "b" ,                                               
		 "t" => "t" ,                                               
		 "d" => "d" ,                                               
		 "k" => "c|q|k" ,                                           
		 "g" => "g" ,                                               
		 "f" => "f|ph" ,                                            
		 "v" => "v" ,                                               
		 "s" => "s" ,                                               
		 "z" => "z" ,                                               
		 "S" => "ch|sh|sch" ,                                           
		 "Z" => "j" ,                                               
		 "m" => "m" ,                                               
		 "n" => "n" ,                                               
		 "N" => "ng" ,                                              
		 "l" => "l" ,                                               
		 "r" => "r" ,                                               
		 "R" => "r" );                   


my ($mot, $prec, $suiv, $pos, $pos4, $pos3)  = @_;

#if ($pos == 4) { $mot =~ s/.+-// }
#elsif ($pos == 2) { $mot =~ s/-.+// }

### fin de polysyllabe 
if (($pos == 4) and ($pos3 != 5)) { #OK
    # contexte gauche
    if (exists $bdlex->{$mot}) {
		$bdlex->{$mot}[PHON] =~ /(.)(.)(\@|)$/;
		$cgi = $1; 
		$cgd = $2;
    }
    else { $cgi = $cgd = "?" }
    # contexte droit
    if (($pos4 == 3) or ($pos4 == 4)) { 
		$cdd = $cdi = "#";
		$frontAp = "%";
    }
    # liaison en [z] et [t]
    elsif (($mot =~ /s|nt$/) and ($pos4 == 2)
	   and (exists $bdlex->{$suiv}) 
	   and ($bdlex->{$suiv}[PHON] =~ /^($VP|\*)/)) 
    {
	#	if ($mot =~ /s$/) {$cdd = "z"}
	#	elsif ($mot =~ /nt$/) { $cdd = "t" }
		$cdd = $bdlex->{$suiv}[LIAIS];
		$bdlex->{$suiv}[PHON] =~ /^(.)/;
		$cdi = $1;
		$frontAp = "-";
    }
    # améliorer pour gérer les mots absents du lexique !!!!!!!!!!!!
    elsif (not exists $bdlex->{$suiv}) { 
		$cdd = $cdi = "?"; 
		$frontAp = "?"; 
    }
    # cas standard
    else {
		($cdd, $cdi) = sonSuiv($suiv);
		$frontAp = "%";
    }

    $frontAv = "-";
}

### monosyllabes
elsif ($pos == 1) {
	# contexte gauche
    if (exists $bdlex->{$mot}) { 
	    $bdlex->{$mot}[PHON] =~ /^(.)/;
	    $cgd = $1;
	}
	else { $cgd = $1 }
	$cgi = sonPrec($prec);
	# contexte droit
    if ($pos4 =~ /3|4/) { 
		$cdd = $cdi = "#";
    }
    else { ($cdd, $cdi) = sonSuiv($suiv) }

    $frontAv = $frontAp = "%"; 
}

### début de poly
elsif ($pos == 2) { #OK
    if (exists $bdlex->{$mot}) { 
	    if ($bdlex->{$mot}[PHON] =~ /^(.)\@(.)(.)/) {
			$cgd = $1; 
			$cdd = $2; 
			$cdi = $3;
			$cgi = sonPrec($prec);
			$frontAv = "%";
	    }
	    elsif ($bdlex->{$mot}[PHON] =~ /^(.)(.)\@(.)(.)/) {
			$cgi = $1; 
			$cgd = $2; 
			$cdd = $3; 
			$cdi = $4;
			$frontAv = "-";
    	}
    }
    else { 
    	$cgi = $cgd = $cdd = $cdi = "?";
    	$frontAv = "?";
    }
    $frontAp = "-";
}
### syllabe interne
elsif ($pos == 3) {
	# cas de <parce que>
	if ($mot eq "parce")
	{
		if ($pos3 == 1) { $cgi = 'a' } # [pas(@)kX]
		else {$cgi = 'r' } # [pars(@)kX]

		$cgd = 's';
		$cdd = 'k'; 
		$cdi = 'V';
		$frontAv = $frontAp = "-"; # suppose cohésion morphologique
	}
	
    elsif (exists $bdlex->{$mot}) { 
    	$bdlex->{$mot}[PHON] =~ /.*(.)(.)\@(.)(.)/;
    	$cgi = $1;
    	$cgd = $2;
    	$cdd = $3;
    	$cdi = $4;
    	# identifier les frontières morphologiques
    	if ($bdlex->{$mot}[PHON] =~ /\@(ma~|rje|rjO~|rO~)$/) {$frontAp = "%"}
    	else {$frontAp = "?"}
    }
    else {
    	$cgi = $cgd = $cdi = $cdd = "?";
    	$frontAp = "?";
    }
    $frontAv = "-";
}
### simplification consonantique : à faire !!!!
elsif ($pos3 == 5) {
    $cgd = $cgi = $cdd = $cdi = "?";
    $frontAv = "-";
    $frontAp = "%";
}
### autres cas
elsif ($pos == 5) { 
    $cgd = $cgi = $cdd = $cdi = "?";
    $frontAv = "-";
    $frontAp = "?";
}

my @t;
push(@t, $cgd, $cgi, $cdd, $cdi, $frontAv, $frontAp);

return @t;
}


sub sonPrec
{
    my $mot = shift;
    my ($con, $ref);
    
    if ($mot =~ /\.|,|\?/) { $con = "#" } # à améliorer
    
    elsif (($mot eq "euh") or ($mot eq "hein")) {$con = "§"}
    
    else {
		if (exists $bdlex->{$mot}) { 
			$bdlex->{$mot}[PHON] =~ /.*(.)(\@|)$/;
			$con = $1;
    	}
    	else { $con = "?" }
    }
  
    return $con;
}


sub sonSuiv
{
    my $mot = shift;
    my ($cdd, $cdi);
    
    if (($mot eq "euh") or ($mot eq "hein")) {$cdd = $cdi = "\%"} ### voir ???
    else {
       	if (exists $bdlex->{$mot}) {
	    if ($bdlex->{$mot}[PHON] =~ /^(.)$/) {$cdd = $1; $cdi = "?"}
	    if ($bdlex->{$mot}[PHON] =~ /^(.)(.)/) {$cdd = $1; $cdi = $2}
    	}
    	else { $cdd = $cdi = "?" }
	}
    my @t;
    push (@t, $cdd, $cdi);

    return @t;
}


