library(KoNLP)
library(stringr)
library(tm)
library(rJava)
# Jars put on the JVM class path before the tokenizer is created.
# NOTE(review): the scala-library entry has a doubled "java/java" path
# segment, unlike its two siblings -- confirm the jar really lives there.
twitter_jars <- c(
  "C:/Users/HR04/Documents/R/win-library/3.6/KoNLP/java/korean-text-3.0.jar",
  "C:/Users/HR04/Documents/R/win-library/3.6/KoNLP/java/java/scala-library-2.11.8.jar",
  "C:/Users/HR04/Documents/R/win-library/3.6/KoNLP/java/twitter-text-1.11.1.jar"
)

.jinit()
for (jar in twitter_jars) {
  .jaddClassPath(jar)
}

# Create the Builder and immediately call build() on it; `twitterObj` ends up
# holding the result of build(), which the tokenizer function below calls.
twitterObj <- .jrcall(
  .jnew("com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder"),
  "build"
)

# Load the course table and keep only the course-name column as a one-column
# data frame (the column name is derived from the expression passed in).
kdnedu <- read.csv(file = "C:/data/kdnedu.csv")
edu <- as.data.frame(kdnedu$과정명)
# Twitter tokenizer noun-extraction function ===========================
# Tokenizes `strings` with the global `twitterObj`, converts the result to its
# string form, and returns the Hangul runs that are immediately followed by
# "(Noun" and are at least two characters long.
# Only the first element of the match list is used, so `strings` is treated
# as a single text.
twi.extractNoun <- function(strings) {
  tokenized <- .jstrVal(.jrcall(twitterObj, "tokenize", strings))
  # Column 2 of the match matrix is capture group 1: the Hangul run itself.
  noun_hits <- str_match_all(tokenized, "([가-힣]+)\\(Noun")
  nouns <- noun_hits[[1]][, 2]
  is_long_enough <- nchar(nouns) >= 2
  nouns[is_long_enough]
}
# Extract the nouns of every course name. lapply() over the column always
# returns a list with one element per course; apply() on the one-column data
# frame would simplify to a matrix/vector whenever every row yields the same
# number of nouns, which would break the per-document paste below.
word <- lapply(as.character(edu[[1L]]), twi.extractNoun)

# One space-separated string of nouns per course.
twitter_sentence <- vapply(
  word,
  function(x) paste0(as.character(unlist(x)), collapse = " "),
  character(1)
)

cps <- Corpus(VectorSource(twitter_sentence))

# NOTE(review): in tm >= 0.7 Corpus(VectorSource()) yields a SimpleCorpus, for
# which TermDocumentMatrix() is documented not to accept custom tokenizer
# functions -- confirm whether `tokenize` below has any effect on the
# installed tm version. The documents are already space-separated nouns.
tdm <- TermDocumentMatrix(
  cps,
  control = list(
    tokenize = twi.extractNoun,
    wordLengths = c(2, 10)
  )
)
tdm.matrix <- as.matrix(tdm)

# Terms occurring at least 10 times, and terms correlated (>= 0.25) with "교육".
findFreqTerms(tdm, lowfreq = 10)
findAssocs(tdm, "교육", 0.25)

# Convert the TDM to a DTM and build a term-frequency table sorted by
# descending frequency.
dtm <- as.DocumentTermMatrix(tdm)
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
wf <- data.frame(word = names(freq), freq = freq)
names(wf)
# Simple bar chart of the most frequent terms (ggplot2).
# NOTE(review): an earlier comment here stated a frequency cutoff of 50, but
# the filter keeps only terms with freq > 350 -- confirm which is intended.
library(ggplot2)
p <- ggplot(data = subset(wf, freq > 350), mapping = aes(word, freq)) +
  geom_bar(stat = "identity") +
  # Tilt the x-axis labels so the terms do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
library(wordcloud)
# Word cloud of terms occurring at least 20 times. `colors` is spelled out in
# full rather than relying on partial matching of `color`.
wordcloud(
  words = names(freq),
  freq = freq,
  min.freq = 20,
  colors = brewer.pal(6, "Dark2")
)

########### wordcloud2
library(wordcloud2)
# NOTE(review): windowsFonts() is Windows-only, and the "malgun" family
# registered here is not referenced by any call visible in this script --
# confirm it is still needed.
windowsFonts(malgun = windowsFont("맑은고딕"))
# Word cloud with rotation disabled (min and max rotation are both 0).
wordcloud2(
  data = wf,
  size = 0.8,
  minSize = 20,
  fontFamily = "맑은고딕",
  minRotation = 0,
  maxRotation = 0,
  rotateRatio = 1
)
#=======================================================
# Term-based hierarchical clustering
#=======================================================
#   1. keep only terms that appear across a reasonable share of the courses
#   2. scale
#   3. compute the distance matrix
#   4. plot the dendrogram and outline 5 clusters
tdm2 <- removeSparseTerms(x = tdm, sparse = 0.99)
m2 <- as.matrix(tdm2)

distMatrix <- dist(x = scale(m2))
fit <- hclust(d = distMatrix, method = "ward.D2")

# Single-panel layout for the dendrogram.
par(mfrow = c(1, 1))
plot(x = fit)
rect.hclust(tree = fit, k = 5)
#================================================
# k-means clustering
#================================================
# Cluster the rows of the transposed matrix into k groups.
m3 <- t(m2)
k <- 4
# NOTE(review): kmeans() picks random initial centres and no seed is set, so
# cluster assignments can differ between runs -- consider set.seed().
kmres <- kmeans(m3, k)
round(kmres$centers, digits = 3)

# Print the three highest-valued terms of each cluster centre.
for (i in seq_len(k)) {
  cat(paste0("cluster ", i, " : "))
  s <- sort(kmres$centers[i, ], decreasing = TRUE)
  # head() avoids printing NA when fewer than three terms are available.
  cat(head(names(s), 3), "\n")
  # print(head(rdmTweets[which(kmres$cluster ==i)],n=3))
}