## KL
library(tm)
library(lsa)   # needed for lsa(), as.textmatrix() and associate() below

######### - Import from Gutenberg - without meta information
DosSource <- URISource("http://www.gutenberg.org/files/2554/2554.txt")
Dostoevsky <- Corpus(DosSource)
Dostoevsky[[1]][10:50]

######## - tokenization
tf <- termFreq(Dostoevsky[[1]])
head(tf)

#### - collocations
doslsacorp <- Corpus(VectorSource(as.character(c(Dostoevsky[[1]]))))
dtm <- TermDocumentMatrix(doslsacorp)
dtm <- removeSparseTerms(dtm, 0.99)
lasem <- lsa(as.matrix(dtm))   # lsa() expects a dense matrix
NewMatrix <- as.textmatrix(lasem)
# collect the indices of all documents with a positive column sum;
# if the number of docs changes, the hard-coded 1680 has to change as well;
# this could also be turned into a variable
tml <- 1
for (i in 2:ncol(as.matrix(NewMatrix))) {
  if (sum(NewMatrix[, i]) > 0) {
    tml <- c(tml, i)
  }
}
tml
x <- associate(NewMatrix[, tml], "Katerina", threshold = 0.5)

### - find
grep("PART I", as.character(Dostoevsky[[1]]))
Dostoevsky[[1]][132]
Dostoevsky[[1]][3529]
Dostoevsky[[1]][7979]
Dostoevsky[[1]][11407]

### - html
art <- URISource("http://news.bbc.co.uk/2/hi/health/2284783.stm")
article <- Corpus(art)
article[[1]][1]
article[[1]][10]

### - concordances
center <- grep("gene", as.character(article[[1]]))
concordances <- sapply(center, function(x) {
  pos <- gregexpr("gene", article[[1]][x])[[1]][1]   # first match on the line
  substr(article[[1]][x], pos - 40, pos + 40)        # 40 characters of context on each side
})
concordances

### - rudimentary string manipulation
monty <- "Monty Python"
grail <- "Holy Grail"
print(monty)
print(grail)
print(c(monty, grail))
substring(monty, 1, 1)
substring(monty, 7, 12)
strsplit(monty, "Python")

### - RSS feed
library(XML)
gswurss <- GmaneSource(url("http://blog.wu.ac.at/feed/rss/"))
corpwurss <- Corpus(gswurss)
corpwurss
names(gswurss)
gswurss$Content
??Reuters
getReaders()

### - Encoding
x <- "fa\xE7ile"
Encoding(x)
Encoding(x) <- "latin1"
x
xx <- iconv(x, "latin1", "UTF-8")
Encoding(c(x, xx))
c(x, xx)

### - Regular expressions
# finger twisters: words that can be typed with the keys 4-6-5-3 on a phone keypad
mobile <- c("gold", "golf", "hole", "hold", "anyword", "anotherword")
regexpr("^[ghi][mno][jkl][def]$", mobile)
mobile[regexpr("^[ghi][mno][jkl][def]$", mobile) != -1]
grep("^[ghi][mno][jkl][def]$", mobile)
mobile[grep("^[ghi][mno][jkl][def]$", mobile)]

# chatwords
chatwords <- c("mine", "mmmmiiiiinnnneeee",
               "mmmmmmiiiiiiiiiiiiiinnnnnnnnnnnnnnnneeeeeeeeeeeeee",
               "xmmiinnneee", "yours", "maxi")
regexpr("^m+i+n+e+$", chatwords)
grep("^m+i+n+e+$", chatwords)
regexpr("m+i+n+e+$", chatwords)
grep("m+i+n+e+$", chatwords)
regexpr("m+i+n+e", chatwords)
grep("m+i+n+e", chatwords)

# NLP Chapter 3
# NW R
library(tm)
library(utils)

# load corpus
DosSource <- URISource("http://www.gutenberg.org/files/2554/2554.txt")
Dostoevsky <- Corpus(DosSource)
Dostoevsky[[1]][1:100]
start <- grep("PART I$", as.character(Dostoevsky[[1]]))
end <- grep("^End of Project Gutenberg'", as.character(Dostoevsky[[1]]))

# text sources
cap <- Dostoevsky[[1]][start:end]
tf <- termFreq(Dostoevsky[[1]],
               control = list(removePunctuation = TRUE, removeNumbers = TRUE))
tokens <- names(tf)

# Section 3.5 - Use of RegExps
# find vowels
word <- "supercalifragilisticexpialidocious"
word
strsplit(word, split = "[aeiou]")
as.vector(strsplit(word, split = "")[[1]])[as.vector(gregexpr("[aeiou]", word)[[1]])]
gregexpr("[aeiou]", word)
length(as.vector(gregexpr("[aeiou]", word)[[1]]))

# frequencies of combinations of two or three vowels
vowelcomb2 <- apply(combn(c("a", "e", "i", "o", "u"), 2), 2, paste, collapse = "")
vowelcomb3 <- apply(combn(c("a", "e", "i", "o", "u"), 3), 2, paste, collapse = "")
rbind(lapply(sapply(c(vowelcomb2, vowelcomb3), grep, tokens), length))

# exclude vowels
text <- paste(Dostoevsky[[1]][start:(start + 10)], collapse = " ")
text
gsub("[aeiou]", "", text)
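# A compact alternative to the vowel index arithmetic above: regmatches()
# extracts the matched substrings directly and table() tallies them. Both are
# base R; this sketch is an addition, not part of the original exercises.
vowels <- regmatches(word, gregexpr("[aeiou]", word))[[1]]
table(vowels)    # frequency of each vowel in the word
length(vowels)   # same count as the gregexpr/length combination above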
# conditional frequency tables
seta <- c("a", "e", "i", "o", "u")
setb <- c("k", "p", "r", "s", "t", "v")
settable <- expand.grid(seta, setb)
setcomb <- apply(settable, 1, paste, collapse = "")
cfreqs <- rbind(lapply(sapply(setcomb, grep, tokens), length))
cfd <- matrix(cfreqs, nrow = length(setb), dimnames = list(setb, seta), byrow = TRUE)
cfd

# Consonant-Vowel Pairs
cv_pairs <- sapply(setcomb, grep, tokens, value = TRUE)
cv_pairs$uv

# Word Stems
sd <- stemDocument(Dostoevsky[[1]])   # requires the SnowballC package
head(sd)
gsub("(ing|ly|ed|ious|ies|ive|es|s|ment)$", "", "processing")
strsplit("processing", split = "(ing|ly|ed|ious|ies|ive|es|s|ment)$")

# Search for tokens
grep("(the) (.*) (man)", Dostoevsky[[1]])
words <- unlist(strsplit(cap, " "))
# NLTK's token pattern "<the> <.*> <man>" has no grep equivalent on a vector
# of single tokens; index arithmetic emulates it instead:
t <- unique(words[(grep("^the$", words)[grep("^man$", words[(grep("^the$", words) + 2)])]) + 1])
t
collocation <- function(a, b, text) {
  i <- intersect(grep(b, text) - 1, grep(a, text))   # positions where a is directly followed by b
  sapply(i, function(x) text[x:(x + 1)])
}
collocation("^the$", "^man$", words)
collo <- function(a, b, words, lag)
  sort(unique(words[intersect(grep(a, words), grep(b, words) - lag - 1) + 1]))
collo("the", "man", words, 1)

# simple approaches to tokenization
unlist(strsplit(Dostoevsky[[1]][start:(start + 8)], split = "[ \t\n]+"))
unlist(strsplit(Dostoevsky[[1]][(start + 6):(start + 8)], split = "[[:space:]]", perl = TRUE))
# only using 'non word characters'
unlist(strsplit(Dostoevsky[[1]][start:(start + 8)], split = "\\W", perl = TRUE))

# sentence segmentation
raw <- gsub('"', "", paste(Dostoevsky[[1]][(start + 10):(start + 66)], collapse = " "), perl = TRUE)
sents <- unlist(strsplit(raw, split = "[.?!] *", perl = TRUE))
head(sents)

# evaluate sentences
mean(sapply(sents, nchar))                                   # average characters per sentence
mean(sapply(sapply(sents, strsplit, split = " "), length))   # average words per sentence

# text wrapping
paste(words[1:20], " (", nchar(words[1:20]), ")", sep = "")
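# The paste() call above only annotates each word with its character count;
# actual wrapping is available in base R via strwrap(). A minimal sketch
# (the width of 72 is an arbitrary choice):
writeLines(strwrap(paste(words[1:60], collapse = " "), width = 72))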