## Statistical Natural Language Processing
## Unit 1: tm and plugins
## Part 2
## Author: Stefan Theussl

################################################################################
## (1) Hadoop InteractiVE
################################################################################

## note that Hadoop has to be installed and configured appropriately
library("hive")
hive_start()
hive_is_available()

## basic DFS operations: list, remove, copy from the local file system
DFS_list("/")
DFS_dir_remove("/tmp")
DFS_put("~/Data/Reuters/minimal", "/tmp/Reuters")
DFS_list("/tmp/Reuters")
head(DFS_read_lines("/tmp/Reuters/reut-00002.xml"))

################################################################################
## (2) tm.plugin.dc
################################################################################

library("tm.plugin.dc")

## "classic" Corpus construction
corpus <- Corpus(DirSource("~/Data/Reuters/minimal"),
                 list(reader = readReut21578XMLasPlain))
summary(corpus)

## 'DistributedCorpus' construction
dc <- DistributedCorpus(DirSource("~/Data/Reuters/minimal"),
                        list(reader = readReut21578XMLasPlain))
summary(dc)

## coerce 'VCorpus' to 'DistributedCorpus'
dc <- as.DistributedCorpus(corpus)
summary(dc)
## note that the old dc is NOT deleted on the (D)FS

## Corpus methods
length(dc)
print(dc)
dc[[2]]
dc <- tm_map(dc, stemDocument)
dc[[2]]

## two concepts: distributed storage and parallel computation
dc_storage(dc)

## create a new storage for dc
args(dc_storage_create)
storage <- dc_storage_create("HDFS", "/tmp/dc", chunksize = 10 * 1024^2)
summary(storage)

## move the data to the new storage
dc_storage(dc) <- storage
summary(dc)
dc[[2]]

## download the corpus from the DFS and coerce it to a 'VCorpus'
## note that there has to be enough memory on the system
corpus <- as.Corpus(dc)
corpus[[2]]

## parallel computation
## currently only enabled for HDFS storage types
library("tm.corpus.Reuters21578")
data("Reuters21578")
system.time(tm_map(Reuters21578, stemDocument))
system.time(tm_map(as.DistributedCorpus(Reuters21578), stemDocument))
dc <- as.DistributedCorpus(Reuters21578, storage = storage)

## DTM construction
dtm <- DocumentTermMatrix(Reuters21578)
dtm_dc <- DocumentTermMatrix(dc)
## inspect the method used for distributed (term-)document matrix construction
tm.plugin.dc:::TermDocumentMatrix.DistributedCorpus

## revisions
getRevisions(dc)

################################################################################
## (3) tm.corpora
################################################################################

library("tm")
data(crude)
meta(crude)
crude[[1]]
meta(crude[[1]])

## Reuters 21578
library("tm.corpus.Reuters21578")
data(Reuters21578)
Reuters21578
class(Reuters21578)
Reuters21578[[3]]

## pre-constructed DTM with preprocessing steps: stemming, stopword removal,
## number removal, tolower, punctuation removal
data(Reuters21578_DTM)

## Reuters RCV1
library("tm.corpus.RCV1")
data("RCV1")
RCV1
class(RCV1)
print(object.size(RCV1), units = "Mb")

## Reuters RCV1 DTM
data(RCV1_DTM)
dim(RCV1_DTM)
print(object.size(RCV1_DTM), units = "Mb")

## inspect the underlying distributed storage of the RCV1 corpus
path_to_storage <- file.path(system.file(package = "tm.corpus.RCV1"),
                             "dc", getRevisions(RCV1))
dir(path_to_storage)
lines <- readLines(file.path(path_to_storage, "part-1_1"))
lines[[2]]
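
## A small aside (not in the original script): term statistics can be computed
## directly on such a pre-built DTM, since tm stores it as a sparse simple
## triplet matrix from the 'slam' package. A minimal sketch, assuming
## Reuters21578_DTM from above is loaded and 'slam' is installed:
library("slam")
## total number of occurrences of each (stemmed) term across the corpus
term_freqs <- col_sums(Reuters21578_DTM)
head(sort(term_freqs, decreasing = TRUE), n = 20)
## tm offers a shortcut for this kind of query
findFreqTerms(Reuters21578_DTM, lowfreq = 100)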
################################################################################
## (4) Chapter 2
################################################################################

## crude corpus (named list)
data(crude)
## file ids
names(crude)
## subset by file id
crude[["reut-00001.xml"]]

## find out how many words document 1 contains
## (an appropriate 'word' method will be available in tm soon)

## use the tokenizer of the MC toolkit
## (http://www.cs.utexas.edu/users/dml/software/mc/)
MC_tokenizer <- tm:::MC_tokenizer
MC_tokenizer(crude[["reut-00004.xml"]])

## length of a text, i.e., the number of tokens
length(MC_tokenizer(crude[["reut-00004.xml"]]))

## distinct words, i.e., the sorted vocabulary
sort(unique(MC_tokenizer(crude[["reut-00004.xml"]])))

## lexical diversity: the average number of times each word type is used
lexical_diversity <- function(text) {
    length(MC_tokenizer(text)) / length(unique(MC_tokenizer(text)))
}
sapply(crude, lexical_diversity)

## NSF corpus (available from the datacube repository)
library("tm.corpus.NSF")
data("NSF_Part1")
lexical_diversity(NSF_Part1[[1]])

## display information about a text
## average word length (rough estimate: character counts include whitespace
## and punctuation)
unlist(lapply(crude, nchar)) /
    unlist(lapply(crude, function(x) length(MC_tokenizer(x))))
## etc.

################################################################################
## (5) Wordlist Corpora
################################################################################

## stopwords
head(stopwords(language = "en"), n = 20)
head(stopwords(language = "de"), n = 20)

## other dictionaries (available from the datacube repository)
library("tm.plugin.tags")
head(tm_get_tags("Positiv", collection = "general_inquirer"), n = 20)
tm_tag_score(dc[[1]], tm_get_tags("Positiv", collection = "general_inquirer"))
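
## A possible extension (not in the original script): contrast positive and
## negative tag counts per document. This sketch assumes the general_inquirer
## collection also provides a "Negativ" category analogous to the "Positiv"
## one used above.
pos <- tm_get_tags("Positiv", collection = "general_inquirer")
neg <- tm_get_tags("Negativ", collection = "general_inquirer")
## net score per document in the crude corpus
sapply(crude, function(doc) tm_tag_score(doc, pos) - tm_tag_score(doc, neg))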