import scalanlp.io._;
import scalanlp.stage._;
import scalanlp.stage.text._;
import scalanlp.text.tokenize._;
import scalanlp.pipes.Pipes.global._;

import edu.stanford.nlp.tmt.stage._;
import edu.stanford.nlp.tmt.model.lda._;
import edu.stanford.nlp.tmt.model.llda._;

val source = CSVFile("ebay-10.txt");

println("Load: " + source + " contains " + source.data.size + " records");

val text = {
  source ~>                              // read from the source file
  Column(2) ~>                           // select column containing text
  TokenizeWith(WhitespaceTokenizer()) ~> // tokenize on whitespace
  TermCounter() ~>                       // collect counts (needed below)
  TermMinimumDocumentCountFilter(5) ~>   // filter terms in <5 docs
  // TermDynamicStopListFilter(30) ~>    // filter out 30 most common terms
  DocumentMinimumLengthFilter(5)         // take only docs with >=5 terms
}

// define fields from the dataset we are going to slice against
val labels = {
  source ~>                              // read from the source file
  Column(1) ~>                           // take column one, the labels
  TokenizeWith(WhitespaceTokenizer()) ~> // turns label field into an array
  TermCounter() ~>                       // collect label counts
  TermMinimumDocumentCountFilter(10)     // filter labels in <10 docs
}

val dataset = LabeledLDADataset(text, labels);

// define the model parameters
val modelParams = LabeledLDAModelParams(dataset);

// name of the output model folder to generate
val modelPath = file("llda-cvb0-" + dataset.signature + "-" + modelParams.signature);

// trains the model, writing to the given output path
TrainCVB0LabeledLDA(modelParams, dataset, output = modelPath, maxIterations = 1000);
// or could use TrainGibbsLabeledLDA(modelParams, dataset, output = modelPath, maxIterations = 1500);
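Once training finishes, the folder at modelPath can be loaded back to infer label distributions for new documents. The following is a minimal sketch based on the toolbox's standard LDA-inference example, adapted for a Labeled LDA model via asCVB0LDA; "ebay-new.txt" is a hypothetical input file with the same column layout as the training data, and the new text must be prepared with the same tokenization pipeline, reusing the model's term index:

// load the trained Labeled LDA model and view it as a plain LDA model
// so the standard CVB0 inference stage can be applied
val model = LoadCVB0LabeledLDA(modelPath).asCVB0LDA;

// prepare new documents with the same pipeline used for training
// ("ebay-new.txt" is a hypothetical file; adjust to your data)
val newText = {
  CSVFile("ebay-new.txt") ~>             // read from the new source file
  Column(2) ~>                           // select column containing text
  TokenizeWith(WhitespaceTokenizer()) ~> // tokenize on whitespace
  TermCounter()                          // collect counts
}

// align the new documents with the trained model's vocabulary
val newDataset = LDADataset(newText, termIndex = model.termIndex);

// infer per-document distributions over the labels and write them out
val perDocTopicDistributions = InferCVB0DocumentTopicDistributions(model, newDataset);
CSVFile(modelPath, "document-topic-distributions.csv").write(perDocTopicDistributions);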