Output of Document to topic mapping not proper in Mahout(0.9) CVB algo

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

Output of Document to topic mapping not proper in Mahout(0.9) CVB algo

newein
This post has NOT been accepted by the mailing list yet.
This post was updated on .
I am using Mahout (0.9) with the hadoop-core (0.2) jar. The following code is used to generate topics using the CVB algorithm provided by Mahout (0.9):

<code>
public class CreateTopicsFromDocuments {

public static void main(String[] args) throws IOException {
    // TODO Auto-generated method stub

    /*removeStopwords();
    System.out.println("stopwords removed");*/
    createSequenceFiles();
    System.out.println("seq created");
    createVectors();
    System.out.println("vectors created");
    createMatrix();
    System.out.println("matrix created");
    createTopics();
    System.out.println("topics created");
    printTopicMapping();
    System.out.println("Topics to word and document to topics printed");
    printActualWordsForCrcFile();
    System.out.println("actual dictionary words");

}

private static void createSequenceFiles()
{
    String inputDir = "E:\\mahout_output\\inputDir";
    String outputDir = "E:\\mahout_output\\sequenceFiles\\";
    String tempDir = "E:\\mahout_output\\tempDir\\";

    String argsForSeq[]={ "-i" , inputDir //input directory path
            , "-o" , outputDir //output directory
            , "-xm" , "sequential"//mapreduce of sequential
            , "-chunk" , "120"
            , "-prefix" , "KI-"
            , "--tempDir" , tempDir

    };

    SequenceFilesFromDirectory sequenceFilesFromDirectory = new SequenceFilesFromDirectory();
    try {
        ToolRunner.run( sequenceFilesFromDirectory , argsForSeq);
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}

private static void createVectors()
{      
    String inputDir = "E:\\mahout_output\\sequenceFiles\\";
    String outputDir = "E:\\mahout_output\\vectors\\";
    String tempDir = "E:\\mahout_output\\tempDir\\";

    String argsForSeq[]={ "-i" , inputDir //input directory path
            , "-o" , outputDir //output directory
            , "-chunk" , "120"
            , "-wt" , "TF"
            /*, "-seq"*/
            , "-x" , "80"

    };

    SparseVectorsFromSequenceFiles sparseVectorsFromSequenceFiles = new SparseVectorsFromSequenceFiles();

    try {
        ToolRunner.run( sparseVectorsFromSequenceFiles , argsForSeq);
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}

private static void createMatrix()
{
    String inputVectorDir = "E:\\mahout_output\\vectors\\";
    String rowIDOutFile = "E:\\mahout_output\\rowIDOutFile\\";

    System.out.println("starting rowID job");
    String[] rowIDArgs = {"--input",inputVectorDir + "/tf-vectors/part-r-00000", "--output", rowIDOutFile};
    try {
        ToolRunner.run(new RowIdJob(), rowIDArgs);
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    System.out.println("finished rowID job");
}

private static void createTopics()
{
    String inputVectorDir = "E:\\mahout_output\\rowIDOutFile\\matrix";
    String inputDictionaryDir = "E:\\mahout_output\\vectors\\dictionary.file-*";
    String outputDocTopicsDir = "E:\\mahout_output\\topics\\docTopics\\";
    String outputTopicTermsDir = "E:\\mahout_output\\topics\\topicTerms\\";
    String tempModelPathDir = "E:\\mahout_output\\topics\\tempModelPath\\";

    String argsForSeq[]={
            "-i" , inputVectorDir //input dir for vector
            ,"-dict" , inputDictionaryDir //input dir for dictionary file
            ,"-o" , outputTopicTermsDir // output dir for topic term
            ,"-dt" , outputDocTopicsDir // output dir for doc topic
            ,"-k" , "50" //no of terms
            ,"-x" ,"5" //no of max iter
            ,"-mt" , tempModelPathDir
            ,"-a" , "0.33"
            , "-nt", "39636" //total no of unique words identified in dictionary
            /*,"-ntt" , "10"
            ,"-nut" , "5"*/
            ,"-e" , "0.33"

    };

    CVB0Driver cvb = new CVB0Driver();
    try {
        ToolRunner.run(cvb,argsForSeq);
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}

private static void printTopicMapping()
{
    String inputDocTopicsDir = "E:\\mahout_output\\topics\\docTopics\\part-m-00000";
    String inputTopicTermsDir = "E:\\mahout_output\\topics\\topicTerms\\part-m-00000";
    String inputDictionaryDir = "E:\\mahout_output\\vectors\\dictionary.file-*";
    String oututDocTopicsDir = "E:\\mahout_output\\topics\\docTopicsReadable";
    String oututTopicTermsDir = "E:\\mahout_output\\topics\\topicTermsReadable";

    try {
        VectorDumper.main(new String[]
                {
                "-i" , inputDocTopicsDir
                , "-o", oututDocTopicsDir
                , "-d", inputDictionaryDir
                , "-dt", "sequencefile"
                , "-sort", "true"
                , "-vs", "100" });

        VectorDumper.main(new String[]
                {
                "-i" , inputTopicTermsDir
                , "-o", oututTopicTermsDir
                , "-d", inputDictionaryDir
                , "-dt", "sequencefile"
                , "-sort", "true"
                , "-vs", "100" });
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}

private static void printActualWordsForCrcFile() throws IOException
{
    /*try
    {*/
    int count=0;
    Configuration config = new Configuration();
    Path path = new Path("E:\\mahout_output\\topics\\docTopics\\part-m-00000");
    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    WritableComparable key = null;
    try {
        key = (WritableComparable) reader.getKeyClass().newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    Writable value = null;
    try {
        value = (Writable) reader.getValueClass().newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    long position = reader.getPosition();

    System.out.println("Starting print");

    while(reader.next(key,value))
    {
        count++;
           System.out.println("Key is: "+key +" value is: "+value+"\n");
    }
    System.out.println("Total count: " + count);
    /*}catch()
    {

    }*/
}
</code>

The document-to-topic mapping output does not look right: for every document it prints only topics that start with one particular letter (here, "a").

Sample output:
0 {2d:0.019996671414880783,3d:0.019994853350969108,4d:0.02000171234917903,5d:0.019994290328033588,a.config:0.01999309367417373,a.k.a:0.02000227944902019,a.system:0.01999771644223781,aaa:0.020003361639812457,aam:0.019990182999365072,aapm:0.020012465032122083,aapv:0.01999879522431889,aar:0.019995543474585993,aas:0.019995157547471696,aav:0.02000267326012652,ab:0.020025978185034182,aba:0.01999553819903237,abandon:0.020013355238553677,abandoned:0.01999559962237951,abandonment:0.019994194616256,abandons:0.02001433184497984,abatement:0.01997728075793184,abberationa:0.020001189392395737} 1 {2d:0.020004928417007538,3d:0.019990152266931252,4d:0.02002377332526204,5d:0.01999483347860494,a.config:0.019994652504501834,a.k.a:0.019992844329573715,a.system:0.019996410286150658,aaa:0.0200327086093093,aam:0.020018411539649547,aapm:0.020004991984687574,aapv:0.020029762256626036,aar:0.020002830458456765,aas:0.020012367759737366,aav:0.01999840846269116,ab:0.019975723479625562,aba:0.020001762809931274,abandon:0.019984400000341813,abandoned:0.01999320247062702,abandonment:0.01998444943894258,abandons:0.02000945656334047,abatement:0.019991352339431202,abberationa:0.020015070123334727
}

The document-to-topic mapping output seems to consist entirely of topics starting with "a".
Am I missing some argument? Please help me correct the output.


Thanks