Xichuan has posted 1 post at DZone. View Full User Profile

Work with Joshua (SMT)

12.21.2011
| 1598 views |
  • submit to reddit

I have been working on Joshua, a toolkit for statistical machine translation (SMT). Before extracting a grammar from a parallel corpus, one necessary step is to eliminate sentences of more than 100 words. In the Hansard corpus it is common to encounter such sentences, so one needs to implement a function to do the filtering. Here is what I did.

 package util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;

public class LongSentenceFilter {

    public void filter(File enFile, File frFile, File oenFile, File ofrFile) {
        StringBuffer enContent = new StringBuffer();
        StringBuffer frContent = new StringBuffer();
        int sentenceCount = 0;
        int lineCount = 0;
        BufferedReader input;
        String line = null;
        ArrayList<Integer> longSentenceIndices = new ArrayList<Integer>();
        try {
            input = new BufferedReader(new FileReader(enFile));

            while ((line = input.readLine()) != null) {
                String[] words = line.split(" ");
                lineCount++;
                if (words.length < 100) {
                    enContent.append(line);
                    enContent.append('\n');
                    sentenceCount++;
                } else {
                    longSentenceIndices.add(lineCount);
                }
            }
            System.out.println("English lineCount: " + lineCount);
            lineCount = 0;
            System.out.println("English sentenceCount: " + sentenceCount);
            sentenceCount = 0;
            input.close();
        } catch (FileNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        try {
            input = new BufferedReader(new FileReader(frFile));
            
            while ((line = input.readLine()) != null) {
            lineCount++;
            if (!longSentenceIndices.contains(lineCount)) {
                frContent.append(line);
                frContent.append('\n');
                sentenceCount++;
            }
        }

        System.out.println("French lineCount: " + lineCount);
        lineCount = 0;
        System.out.println("French sentenceCount: " + sentenceCount);
        sentenceCount = 0;
        input.close();
        } catch (FileNotFoundException e1) {
            // TODO Auto-generated catch block
            e1.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        

        Writer output;
        try {
            output = new BufferedWriter(new FileWriter(oenFile));
            output.write(enContent.toString());
            output.close();

            output = new BufferedWriter(new FileWriter(ofrFile));
            output.write(frContent.toString());
            output.close();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        LongSentenceFilter filter = new LongSentenceFilter();
        File enFile = new File("test/input/hansard.e.tok.lc");
        File oenFile = new File("test/output/hansard.e.tok.lc.filtered");

        File frFile = new File("test/input/hansard.f.tok.lc");
        File ofrFile = new File("test/output/hansard.f.tok.lc.filtered");

        filter.filter(enFile, frFile, oenFile, ofrFile);
    }

}

 

 

 

0
Published at DZone with permission of its author, Xichuan Wu.

(Note: Opinions expressed in this article and its replies are the opinions of their respective authors and not those of DZone, Inc.)