단어 기반 계층적 클러스터링

Study Info

단어 기반 계층적 클러스터링

HR분석 0 48 2019.12.12 05:38

library(KoNLP)

library(stringr)

library(tm)

library(rJava)


# Start the JVM through rJava before any Java class is touched.
.jinit()

# Folder holding the jar files of the Korean tokenizer. The original script
# loaded the Scala runtime from ".../KoNLP/java/java/", a doubled path segment
# that does not match its sibling jars; all three are now resolved from the
# same folder.
# NOTE(review): machine-specific path -- adjust to the local R library.
konlp_java_dir <- "C:/Users/HR04/Documents/R/win-library/3.6/KoNLP/java"

.jaddClassPath(file.path(
  konlp_java_dir,
  c(
    "korean-text-3.0.jar",
    "scala-library-2.11.8.jar",
    "twitter-text-1.11.1.jar"
  )
))

# Instantiate the processor builder and call its build() method; the resulting
# Java object is the tokenizer used by the rest of the script.
twitterObj <- .jrcall(
  .jnew("com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder"),
  "build"
)



# Read the course list from disk.
# NOTE(review): no file encoding is given, so the platform default is used --
# confirm that it matches the CSV.
kdnedu <- read.csv(file = "C:/data/kdnedu.csv")

# One-column data frame holding only the course-name column (과정명).
edu <- as.data.frame(x = kdnedu$과정명)


# Twitter morpheme-based noun extractor ====================================

# Extract Hangul nouns from a piece of text.
#
# Calls the `tokenize` method of the global `twitterObj`, converts the result
# to its string form and pulls out every Hangul run that is immediately
# followed by "(Noun".
#
# strings: text handed to the tokenizer.
# Returns a character vector of the matched nouns that are at least two
# characters long (possibly empty).
twi.extractNoun <- function(strings) {
  tokenized <- .jstrVal(.jrcall(twitterObj, "tokenize", strings))

  # Column 2 of the match matrix is the capture group, i.e. the noun itself;
  # only the matches for the first element of `tokenized` are used.
  nouns <- str_match_all(tokenized, "([가-힣]+)\\(Noun")[[1]][, 2]

  nouns[nchar(nouns) >= 2]
}


# Tokenise every course name into its nouns.
# lapply() is used instead of apply(): apply() collapses its result into a
# matrix/vector whenever every row yields the same number of nouns, and the
# next step would then iterate over single words instead of documents.
# lapply() always returns one list element per course.
word <- lapply(as.character(edu[[1]]), twi.extractNoun)

# Re-join each course's nouns into one space-separated string. vapply()
# guarantees a character vector with exactly one string per course, even when
# there are no courses at all.
twitter_sentence <- vapply(
  word,
  function(x) paste0(as.character(unlist(x)), collapse = " "),
  character(1)
)



# Wrap the noun-only sentences in a tm corpus, one document per course.
cps <- Corpus(VectorSource(twitter_sentence))

# Term-document matrix restricted to terms of 2 to 10 characters.
# NOTE(review): some tm versions may ignore a custom tokenizer for the corpus
# class returned by Corpus() -- confirm against the installed version.
tdm <- TermDocumentMatrix(
  cps,
  control = list(
    tokenize = twi.extractNoun,
    wordLengths = c(2, 10)
  )
)

# Dense copy of the matrix.
tdm.matrix <- as.matrix(tdm)

# Terms that occur at least 10 times.
findFreqTerms(tdm, lowfreq = 10)

# Terms associated with "교육" at a correlation limit of 0.25.
findAssocs(tdm, "교육", 0.25)




# Convert the term-document matrix into a document-term matrix.
dtm <- as.DocumentTermMatrix(tdm)

# Total count of every term over all documents, most frequent first.
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)

# Same information as a data frame with columns `word` and `freq`.
wf <- data.frame(word = names(freq), freq = freq)

names(wf)




# Simple bar chart of the most frequent terms (ggplot2).
# Only terms whose total frequency is above 350 are drawn.
library(ggplot2)

p <- ggplot(subset(wf, freq > 350), aes(word, freq)) +
  geom_bar(stat = "identity") +
  # Tilt the x-axis labels by 45 degrees.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p



library(wordcloud)

# Word cloud of every term that occurs at least 20 times, coloured with six
# colours of the "Dark2" palette. The argument is spelled out as `colors`
# instead of relying on partial matching of `color`.
wordcloud(
  names(freq),
  freq,
  min.freq = 20,
  colors = brewer.pal(6, "Dark2")
)


# wordcloud2 ----------------------------------------------------------------

library(wordcloud2)

# Register the font under the name `malgun` for Windows graphics devices.
# NOTE(review): the family name is written without a space here; Windows
# usually lists this font as "맑은 고딕" -- confirm the intended name.
windowsFonts(malgun = windowsFont("맑은고딕"))

# Word cloud of the frequency table with all words kept horizontal
# (both rotation bounds are 0).
wordcloud2(
  data = wf,
  size = 0.8,
  minSize = 20,
  fontFamily = "맑은고딕",
  minRotation = 0,
  maxRotation = 0,
  rotateRatio = 1
)


# =======================================================
# Word-based hierarchical clustering
# =======================================================
# 1. Keep only the words that appear across a reasonably wide range of
#    courses (drop the sparsest terms).
# 2. Scale the matrix.
# 3. Compute the distance matrix.
# 4. Plot the dendrogram and mark 5 clusters on it.

tdm2 <- removeSparseTerms(tdm, sparse = 0.99)
m2 <- as.matrix(tdm2)

distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D2")

# Single-panel layout for the dendrogram.
par(mfrow = c(1, 1))
plot(fit)

# Draw rectangles around the 5 clusters obtained by cutting the tree.
rect.hclust(fit, k = 5)




# ================================================
# k-means clustering
# ================================================

# Transpose the term matrix so that kmeans() clusters its columns (the
# documents) and every cluster centre has one value per term.
m3 <- t(m2)
k <- 4

# NOTE(review): kmeans() starts from random centres, so the clusters can
# differ between runs; call set.seed() beforehand if reproducibility matters.
kmres <- kmeans(m3, k)

round(kmres$centers, digits = 3)

# Print the (up to) three highest-weighted terms of every cluster centre.
for (i in seq_len(k)) {
  cat(paste0("cluster ", i, " : "))

  s <- sort(kmres$centers[i, ], decreasing = TRUE)

  # head() does not pad with NA when fewer than three terms are available.
  cat(head(names(s), 3), "\n")

  # print(head(rdmTweets[which(kmres$cluster ==i)],n=3))
}



[이 게시물은 HR분석님에 의해 2020-01-26 18:51:02 교육자료에서 복사됨]

Comments

Study Info

상담 문의


010.9417.2025 hrd04@naver.com