/*
 * lang_guess.c - a utility for guessing the language in which a section of
 *                text is written.
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2001 Steve Underwood
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

/* This is a C implementation of the Perl program implementing the same
   technique for assessing a language by Gertjan van Noord.

   ftp://ftp.let.rug.nl/pub/vannoord/TextCat/
   http://www.let.rug.nl/~vannoord/TextCat/

   The C code is faster, and can easily be turned into a library for use within
   larger applications. This version will eventually become such a library. It
   can almost certainly be subjected to significant optimisation!
   
   Strengths: This technique is reasonably fast and accurate.
   
   Weaknesses: It is rather subject to quirky behaviour with "non standard" text.
   For example, the trained language data supplied was trained on typical text for
   most European languages. Feed it text in all upper case, and it gets confused.
   Feed it the output from a SMS controller, and the heavy use of abbreviations
   fools it (I know - I tried!). You can retrain the data for your own environment
   and it will then give good results on similar material. Try to generalise it too
   much, though, and its accuracy may fall.
   
   The Chinese trained data was trained with spaces between each character, which
   is hardly typical. The software certainly needs more intelligence in its handling
   of this type of situation.
 */

#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <errno.h>
#include <stdint.h>
#include <sys/socket.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <dirent.h>
#include <utime.h>

#define FALSE   0
#define TRUE    (!FALSE)

/* When non-zero, progress and per-language scores are reported on stderr. */
int verbose = FALSE;
/* The largest number of input n-grams compared against each language model.
   It is also the penalty added for an input n-gram a model does not contain. */
int max_tested_ngrams = 400;
/* If more than this many languages score within hit_ratio of the best one,
   the text is reported as not recognised. */
int max_results = 10;
/* When non-zero, input n-grams seen fewer times than this are discarded. */
int occurance_threshold = 0;
/* A language counts as a rival to the best one unless its score exceeds the
   best score multiplied by this factor. */
float hit_ratio = 1.05;

/* The directory scanned for "*.lm" language model files. The pointer targets
   a string literal, so it is const qualified to keep writes from compiling. */
static const char *lm_path = "./LM";
/* Language model lines starting with one of these characters are ignored. */
static const char *non_word_characters = "0123456789 \t\n";

/*
 * A name/score pair. The same record type is used for three tables in this
 * file, and the meaning of the score depends on the table:
 *   - n-grams counted from the input text: the number of occurrences;
 *   - n-grams loaded from a language model file: the 1-based position of the
 *     n-gram among the accepted lines of that file;
 *   - candidate languages: the accumulated distance between the input and
 *     that language's model.
 */
typedef struct
{
    char *name;     /* NUL terminated string from malloc() or strdup() */
    int score;      /* See the description above */
} lang_desc_t;

static int result_cmp (const void *one, const void *two)
{
    return ((lang_desc_t *) two)->score - ((lang_desc_t *) one)->score;
}
/*- End of function --------------------------------------------------------*/

static int name_cmp (const void *one, const void *two)
{
    return strcmp(((lang_desc_t *) one)->name, ((lang_desc_t *) two)->name);
}
/*- End of function --------------------------------------------------------*/

/*
 * Record one occurrence of the n-gram made of the first "len" bytes of "text".
 *
 * ngram   The table of n-grams seen so far. The caller must ensure there is
 *         room for one more entry; no capacity check is possible here.
 * ngrams  The number of entries currently in the table.
 * text    The start of the n-gram. It need not be terminated after "len"
 *         bytes.
 * len     The length of the n-gram in bytes. This must be positive.
 *
 * If the n-gram is already in the table its count is incremented. Otherwise a
 * new entry, with a freshly allocated name and a count of one, is appended.
 *
 * Returns the new number of entries in the table. If memory for a new name
 * cannot be allocated the n-gram is dropped and the table is left unchanged.
 *
 * The function is static because a plain "inline" definition provides no
 * external definition in C99 and later, which fails to link whenever the
 * compiler chooses not to inline the call.
 */
static int inserter (lang_desc_t *ngram, int ngrams, const char *text, int len)
{
    int i;
    char *name;

    for (i = 0;  i < ngrams;  i++)
    {
        /* strncmp() alone would also accept a longer stored n-gram which
           merely starts with these bytes (e.g. "abcde" for "ab"), so the
           stored name must also be no longer than the n-gram sought. */
        if (strncmp(ngram[i].name, text, len) == 0
            &&
            strlen(ngram[i].name) <= (size_t) len)
        {
            ngram[i].score++;
            return ngrams;
        }
        /*endif*/
    }
    /*endfor*/

    name = malloc ((size_t) len + 1);
    if (name == NULL)
        return ngrams;
    /*endif*/
    /* strncpy() stops reading at a NUL, so a text shorter than len is safe.
       The terminator is always written explicitly. */
    strncpy(name, text, len);
    name[len] = '\0';
    ngram[ngrams].name = name;
    ngram[ngrams].score = 1;
    return  ngrams + 1;
}
/*- End of function --------------------------------------------------------*/

static int create_lm (const char *text, lang_desc_t *ngram)
{
    char *sorted;
    const char *s;
    char *t;
    int i;
    int j;
    int len;
    int flen;
    int ngrams;
    char word[256 + 1];
    char buf[6];

    ngrams = 0;

    /* ngram contains reference to the hash we build,
       then add the ngrams found in each word in the hash */
    s = text;
    while (*s)
    {
        t = word;
        *t++ = '_';
        while (isspace(*s))
            s++;
        /*endwhile*/
        while (!isspace(*s)  &&  *s)
            *t++ = *s++;
        /*endwhile*/
        *t++ = '_';
        *t++ = '\0';
        
        len = strlen(word);
        flen = len;
        for (i = 0;  i < flen;  i++)
        {
            if (len > 4)
                ngrams = inserter (ngram, ngrams, word + i, 5);
            /*endif*/
            if (len > 3)
                ngrams = inserter (ngram, ngrams, word + i, 4);
            /*endif*/
            if (len > 2)
                ngrams = inserter (ngram, ngrams, word + i, 3);
            /*endif*/
            if (len > 1)
                ngrams = inserter (ngram, ngrams, word + i, 2);
            /*endif*/
            ngrams = inserter (ngram, ngrams, word + i, 1);
            len--;
        }
        /*endfor*/
    }
    /*endwhile*/

    if (verbose)
        fprintf (stderr, "Count ngrams done - %d\n", ngrams);
    /*endif*/

    /* As suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
       sorting by removing singletons */
    /* However I have very bad results for short inputs, this way */
    if (occurance_threshold)
    {
        qsort (ngram, ngrams, sizeof(*ngram), result_cmp);
        for (i = 0;  i < ngrams;  i++)
        {
            if (ngram[i].score < occurance_threshold)
            {
                 ngrams = i;
                 break;
            }
            /*endif*/
        }
        /*endfor*/
    }
    /*endif*/
    if (ngrams > max_tested_ngrams)
    {
        /* Sort the ngrams, and spit out the most frequent ones. */
        qsort (ngram, ngrams, sizeof(*ngram), result_cmp);
        ngrams = max_tested_ngrams;
    }
    /*endif*/
    qsort (ngram, ngrams, sizeof(*ngram), name_cmp);
    if (verbose)
        fprintf (stderr, "Sorting done \n");
    /*endif*/

    return ngrams;
}
/*- End of function --------------------------------------------------------*/

/*
 * Guess the language in which a piece of text is written.
 *
 * text  The NUL terminated text to be assessed.
 *
 * Every "*.lm" file in the lm_path directory is treated as a language model.
 * The n-gram profile of the text is compared with each model: an n-gram found
 * in the model contributes the difference between its position in the model
 * and its position in the profile, and an n-gram missing from the model
 * contributes max_tested_ngrams. The language with the lowest total is the
 * best match.
 *
 * Returns the file name of the best matching model (including the ".lm"
 * suffix) as a heap allocated string, which the caller may free(). NULL is
 * returned if text is NULL, if the model directory cannot be opened, or if
 * more than max_results languages score within hit_ratio of the best one.
 *
 * The process exits with status 2 if no language model is found, if a model
 * cannot be opened, or if memory runs out.
 */
char *guess_language (const char *text)
{
    enum
    {
        MAX_LANGUAGES = 1000,
        MAX_MODEL_NGRAMS = 10000
    };
    int i;
    int j;
    int p;
    int len;
    int candidates;
    DIR *dirp;
    struct dirent *direntp;
    lang_desc_t language[MAX_LANGUAGES];
    int languages;
    lang_desc_t language_ngram[MAX_MODEL_NGRAMS];
    int language_ngrams;
    lang_desc_t *input_ngram;
    lang_desc_t *test_ngram;
    int input_ngrams;
    size_t text_len;
    char path[256 + 1];
    FILE *in;
    char buf[132 + 1];
    char *s;
    char *best;

    if (text == NULL)
        return  NULL;
    /*endif*/

    /* Open directory to find which languages are supported */
    languages = 0;
    dirp = opendir (lm_path);
    if (dirp == NULL)
    {
        fprintf (stderr, "Failed to open %s - %s\n", lm_path, strerror (errno));
        return  NULL;
    }
    /*endif*/
    while ((direntp = readdir (dirp)) != NULL)
    {
        len = strlen (direntp->d_name);
        if (len > 3  &&  strcmp (direntp->d_name + len - 3, ".lm") == 0)
        {
            if (languages >= MAX_LANGUAGES)
            {
                fprintf (stderr,
                         "Too many language models - only the first %d are used\n",
                         MAX_LANGUAGES);
                break;
            }
            /*endif*/
            language[languages].name = strdup (direntp->d_name);
            if (language[languages].name == NULL)
            {
                fprintf (stderr, "Out of memory\n");
                exit (2);
            }
            /*endif*/
            language[languages++].score = 0;
        }
        /*endif*/
    }
    /*endwhile*/
    closedir (dirp);

    if (languages == 0)
    {
        fprintf (stderr, "No language models found.\n");
        exit (2);
    }
    /*endif*/
    if (verbose)
    {
        fprintf (stderr, "There are %d languages\n", languages);
        for (i = 0;  i < languages;  i++)
            fprintf (stderr, "Language %d is %s\n", i, language[i].name);
        /*endfor*/
    }
    /*endif*/

    /* Create ngrams for input. create_lm() cannot check the size of its
       table, so the table is sized for the worst case. Every word has at
       least one byte of text and gains two '_' markers, so there are at most
       3*n + 2 start positions in n bytes of text (allowing for one empty
       word), and each position yields at most 5 n-grams. */
    text_len = strlen (text);
    input_ngram = NULL;
    if (text_len <= (SIZE_MAX/sizeof(*input_ngram) - 10)/15)
        input_ngram = malloc ((15*text_len + 10)*sizeof(*input_ngram));
    /*endif*/
    if (input_ngram == NULL)
    {
        fprintf (stderr, "Out of memory\n");
        exit (2);
    }
    /*endif*/
    input_ngrams = create_lm(text, input_ngram);

    /* Load each language model and test the input against that language. */
    for (i = 0;  i < languages;  i++)
    {
        len = snprintf(path, sizeof(path), "%s/%s", lm_path, language[i].name);
        if (len < 0  ||  (size_t) len >= sizeof(path))
        {
            /* The name does not fit, so the file cannot be opened */
            fprintf(stderr, "Cannot open %s/%s\n", lm_path, language[i].name);
            exit(2);
        }
        /*endif*/
        in = fopen(path, "r");
        if (in == NULL)
        {
            fprintf(stderr, "Cannot open %s\n", path);
            exit(2);
        }
        /*endif*/

        /* Entries beyond the capacity of the table are not loaded */
        language_ngrams = 0;
        while (language_ngrams < MAX_MODEL_NGRAMS  &&  fgets (buf, sizeof(buf), in))
        {
            /* Use only lines starting with appropriate characters. Others are
               ignored. */
            if (strchr(non_word_characters, buf[0]) == NULL)
            {
                /* The n-gram is the text before the first space or tab */
                s = strpbrk(buf, " \t");
                if (s)
                {
                    *s = '\0';
                    language_ngram[language_ngrams].name = strdup(buf);
                    if (language_ngram[language_ngrams].name == NULL)
                    {
                        fprintf (stderr, "Out of memory\n");
                        exit (2);
                    }
                    /*endif*/
                    /* The position in the file is the rank of the n-gram */
                    language_ngram[language_ngrams].score = language_ngrams + 1;
                    language_ngrams++;
                }
                /*endif*/
            }
            /*endif*/
        }
        /*endwhile*/
        fclose(in);
        /* Order by name, so bsearch() can be used for the lookups */
        qsort (language_ngram,
               language_ngrams,
               sizeof(language_ngram[0]),
               name_cmp);

        /* Compare the language model with the input ngrams list */
        p = 0;
        for (j = 0;  j < input_ngrams;  j++)
        {
            test_ngram = bsearch (&input_ngram[j],
                                  language_ngram,
                                  language_ngrams,
                                  sizeof(language_ngram[0]),
                                  name_cmp);
            if (test_ngram)
                p += abs(test_ngram->score - j);
            else
                p += max_tested_ngrams;
            /*endif*/
        }
        /*endfor*/
        language[i].score = p;
        /* Free the allocated language model ngram data */
        for (j = 0;  j < language_ngrams;  j++)
            free (language_ngram[j].name);
        /*endfor*/
    }
    /*endfor*/

    /* Free the allocated input ngram data */
    for (j = 0;  j < input_ngrams;  j++)
        free (input_ngram[j].name);
    /*endfor*/
    free (input_ngram);

    if (verbose)
    {
        qsort (language, languages, sizeof(language[0]), name_cmp);
        for (i = 0;  i < languages;  i++)
            fprintf (stderr, "%-30s scored %d\n", language[i].name, language[i].score);
        /*endfor*/
    }
    /*endif*/

    /* result_cmp puts the highest score first, so the best match (the lowest
       distance) ends up as the last entry. */
    qsort (language, languages, sizeof(language[0]), result_cmp);

    /* Find how many languages score too close to the best one to call */
    for (i = languages - 2;  i >= 0;  i--)
    {
        if (language[languages - 1].score*hit_ratio < language[i].score)
            break;
        /*endif*/
    }
    /*endfor*/
    candidates = languages - 1 - i;

    best = NULL;
    if (candidates <= max_results)
    {
        /* Hand the winning name over to the caller */
        best = language[languages - 1].name;
        language[languages - 1].name = NULL;
    }
    /*endif*/
    /* Release every name which is not being returned. If the language was
       not clearly recognised that is all of them. */
    for (i = 0;  i < languages;  i++)
        free (language[i].name);
    /*endfor*/
    return  best;
}
/*- End of function --------------------------------------------------------*/

//#if defined(XYZZY)
/*
 * Test harness. The single argument names a file in the short texts
 * directory (without its ".txt" suffix). Up to sizeof(buf) - 1 bytes of the
 * file are read and passed to guess_language(), and the result is printed.
 *
 * Returns 0 after reporting a result. Exits with status 2 if the argument is
 * missing, or if the file cannot be opened or read.
 */
int main (int argc, char *argv[])
{
    int in;
    char *lang;
    char buf[100000];
    int len;
    ssize_t got;

    if (argc < 2)
    {
        fprintf (stderr, "Usage: %s <text-name>\n", (argc > 0)  ?  argv[0]  :  "lang_guess");
        exit (2);
    }
    /*endif*/
    len = snprintf(buf, sizeof(buf), "/home/steveu/festival/text_cat/ShortTexts/%s.txt", argv[1]);
    if (len < 0  ||  (size_t) len >= sizeof(buf))
    {
        printf("File name too long\n");
        exit(2);
    }
    /*endif*/
    in = open(buf, O_RDONLY);
    if (in < 0)
    {
        printf("Cannot open %s\n", buf);
        exit(2);
    }
    /*endif*/
    /* read() may return less than was asked for, so read until the file
       ends, or the buffer is full. One byte is kept back for the NUL. */
    len = 0;
    while ((size_t) len < sizeof(buf) - 1)
    {
        got = read(in, buf + len, sizeof(buf) - 1 - (size_t) len);
        if (got < 0)
        {
            if (errno == EINTR)
                continue;
            /*endif*/
            printf("Read failed - %s\n", strerror (errno));
            close (in);
            exit(2);
        }
        /*endif*/
        if (got == 0)
            break;
        /*endif*/
        len += (int) got;
    }
    /*endwhile*/
    buf[len] = '\0';
    close (in);
    printf("Processing %d bytes\n", len);
    lang = guess_language (buf);
    if (lang)
        printf ("Language is %s\n", lang);
    else
        printf ("Language not recognised\n");
    /*endif*/
    return  0;
}
/*- End of function --------------------------------------------------------*/
//#endif
/*- End of file ------------------------------------------------------------*/
