-
Notifications
You must be signed in to change notification settings - Fork 0
/
entity-processing.R
141 lines (109 loc) · 4.26 KB
/
entity-processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#library(rJava)
library(PAMPO)
# Run named-entity recognition over `source` with PAMPO, report a short
# summary on the console and return the located entities.
#
# Arguments:
#   source  Input handed to PAMPO_pt(). It is passed with mode 0 when
#           `data.type` is "folder" and with mode 2 when it is "text".
#   to      Base name of the output file; ".csv" is appended. Use "" (or
#           NULL/NA) to skip writing the file.
#
# Globals read (they are not parameters and must exist before the call):
#   idiom      Language code; only "pt" is supported.
#   data.type  Either "folder" or "text".
#
# Returns the data frame produced by PAMPO_pt(), with columns 2 and 3
# coerced to integer. In "text" mode a leading `File` column filled with "?"
# is added so both modes share the same layout.
#
# Stops with an error for an unsupported language or data type.
extractEntities <- function(source, to = "extracted_entities")
{
  if (idiom != "pt") {
    stop("NER is unavailable for this language", "\n")
  }

  if (data.type == "folder") {
    pampo.mode <- 0
  } else if (data.type == "text") {
    pampo.mode <- 2
  } else {
    stop("Invalid data type")
  }

  # Only the recognition itself is timed.
  start.extraction <- Sys.time()
  entity.positions <- PAMPO_pt(source, pampo.mode)
  end.extraction <- Sys.time()
  extraction.time <- round(difftime(end.extraction, start.extraction,
                                    units = "mins"), 3)

  # Add the File column, avoiding code changes to deal with the text option.
  # The placeholder is repeated once per row so that an empty result (no
  # entity found) still binds instead of failing on a 1-vs-0 row mismatch.
  if (data.type == "text") {
    entity.positions <- cbind(File = rep("?", nrow(entity.positions)),
                              entity.positions)
  }

  info1 <- paste("Number of entity locations:", nrow(entity.positions))
  info2 <- paste("Number of analyzed files:",
                 length(unique(entity.positions$File)))
  info3 <- paste("Number of distinct entities:",
                 length(unique(entity.positions$Entity_desamb)))
  info4 <- paste("Extraction time:", extraction.time, "minutes")
  cat(info1, "\n\n")
  cat(info2, "\n\n")
  cat(info3, "\n\n")
  cat(info4, "\n\n\n")

  # NOTE(review): columns 2 and 3 are presumably Paragraph and Sentence;
  # confirm against the PAMPO_pt() output layout.
  entity.positions[c(2, 3)] <- lapply(entity.positions[c(2, 3)], as.integer)

  # Positions of named entities are written to a file only when a usable
  # name was given; NULL and NA are treated like "" rather than crashing.
  if (length(to) == 1L && !is.na(to) && nzchar(to)) {
    output.file <- paste0(to, ".csv")
    write.table(entity.positions, output.file, sep = ", ", row.names = FALSE,
                col.names = TRUE, quote = FALSE, append = FALSE)
  }

  entity.positions
}
# Load the positions of named entities saved by a former recognition.
#
# Arguments:
#   file            Path of the delimited file to import.
#   file.header     TRUE when the first line holds the column names.
#   file.separator  Field separator used in the file. A separator padded
#                   with whitespace (such as the default ", ") is accepted.
#
# Returns a data frame with one row per entity location; leading whitespace
# is removed from columns 4 and 5.
fileExtraction <- function(file, file.header = TRUE, file.separator = ", ")
{
  # read.csv() only accepts a one-byte `sep`, so a padded separator such as
  # ", " is reduced to its delimiter character. Numeric fields are stripped
  # by the reader itself and the padding left in the text columns is removed
  # below.
  delimiter <- file.separator
  if (nchar(delimiter, type = "bytes") > 1) {
    delimiter <- trimws(delimiter)
    if (nchar(delimiter, type = "bytes") != 1) {
      stop("'file.separator' must be a single-byte character, optionally ",
           "surrounded by whitespace")
    }
  }

  # Data frame imported from a file with information about named entities
  # and their positions through a given corpus.
  extracted.entities <- read.csv(file, header = file.header, sep = delimiter)
  extracted.entities[c(4, 5)] <- lapply(extracted.entities[c(4, 5)],
                                        trimws, which = "left")
  extracted.entities
}
# Tidy a table of extracted entities before further analysis.
#
# Arguments:
#   entities       Data frame of entity locations. Columns 1, 4 and 5 are
#                  turned into character vectors and columns 4 and 5 lose
#                  their leading whitespace.
#   special.cases  Row indices to discard (none by default).
#
# Rows whose `Entity` value appears in `exceptions` are discarded as well.
# `exceptions` is not an argument: it is resolved outside this function and
# must exist when the function is called.
#
# Returns the remaining rows of `entities`.
cleanEntities<-function(entities, special.cases=c())
{
  for (column in c(1, 4, 5)) {
    entities[[column]] <- as.character(entities[[column]])
  }
  for (column in c(4, 5)) {
    entities[[column]] <- trimws(entities[[column]], which = "left")
  }

  # Rows explicitly flagged by the caller go first, by position.
  has.special.cases <- length(special.cases) > 0
  if (has.special.cases) {
    entities <- entities[-special.cases, ]
  }

  # Then drop every entity listed among the uninteresting ones.
  uninteresting <- entities$Entity %in% exceptions
  entities[!uninteresting, ]
}
# Count how many named entities were located in each sentence.
#
# Arguments:
#   entity.locations  Data frame with one row per entity location. It must
#                     contain the columns File, Paragraph and Sentence.
#
# Returns a data frame with the columns File, Paragraph, Sentence and
# occurrences (number of rows of `entity.locations` for that sentence),
# ordered by File, Paragraph and Sentence. An input without rows yields a
# result without rows.
countEntities <- function(entity.locations)
{
  keys <- c("File", "Paragraph", "Sentence")
  missing.keys <- setdiff(keys, names(entity.locations))
  if (length(missing.keys) > 0) {
    stop("Missing column(s) in entity locations: ",
         paste(missing.keys, collapse = ", "))
  }

  # Keep only the sentence identifiers (selected by name, so their position
  # in the input does not matter) plus a unit weight per location.
  tally <- entity.locations[, keys, drop = FALSE]
  tally$occurrences <- rep(1, nrow(tally))

  # aggregate() stops on an input without rows, so answer that case directly.
  if (nrow(tally) == 0) {
    return(tally)
  }

  entity.occurrences <- aggregate(occurrences ~ File + Paragraph + Sentence,
                                  data = tally, FUN = sum)
  entity.occurrences[order(entity.occurrences$File,
                           entity.occurrences$Paragraph,
                           entity.occurrences$Sentence), ]
}
# Verify that the per-sentence counts agree with the entity locations.
#
# Arguments:
#   entity.locations  Data frame with one row per entity location, holding
#                     the columns File, Paragraph and Sentence.
#   entity.counter    Data frame with the columns File, Paragraph, Sentence
#                     and occurrences, one row per sentence.
#
# For every row of `entity.counter` the matching rows of `entity.locations`
# are counted and compared with `occurrences`; each disagreement is reported
# on the console with the offending row number.
#
# Returns TRUE when every count agrees (also when `entity.counter` has no
# rows) and FALSE otherwise.
confirmOccurrences <- function(entity.locations, entity.counter)
{
  correct <- vapply(seq_len(nrow(entity.counter)), function(i) {
    # Select elements corresponding to the same sentence.
    same.sentence <- entity.locations$File == entity.counter[i, "File"] &
      entity.locations$Paragraph == entity.counter[i, "Paragraph"] &
      entity.locations$Sentence == entity.counter[i, "Sentence"]

    # Comparisons involving NA are not matches; counting the TRUE values
    # avoids the phantom rows that NA produces in a logical row index.
    found <- sum(same.sentence, na.rm = TRUE)

    agrees <- isTRUE(found == entity.counter[i, "occurrences"])
    if (!agrees) {
      cat(paste("There's a wrong number of occurrences at line",
                i, "on data frame.\n"))
    }
    agrees
  }, logical(1))

  all(correct)
}
# Preserve the entity locations of sentences that hold certain numbers of
# named entities.
#
# Arguments:
#   entity.locations   Data frame with one row per entity location, holding
#                      the columns File, Paragraph and Sentence.
#   sentence.entities  Data frame with the columns File, Paragraph, Sentence
#                      and occurrences, one row per sentence.
#   entity.number      Accepted values of `occurrences` (2 by default).
#
# Returns the rows of `entity.locations` that belong to the accepted
# sentences, in their original relative order. The columns are those of
# `entity.locations`, with File, Paragraph and Sentence placed first (the
# layout produced by merge()).
filterSentences <- function(entity.locations, sentence.entities, entity.number=c(2))
{
  keys <- c("File", "Paragraph", "Sentence")

  # Helper column to maintain ordering, since merge() reorders rows. Its
  # name is chosen so that it cannot clash with an existing column.
  candidate.names <- make.unique(c(names(entity.locations), "ID"))
  order.column <- candidate.names[length(candidate.names)]
  entity.locations[[order.column]] <- seq_len(nrow(entity.locations))

  # Choose the exact sentences. Only the key columns are kept so the merge
  # adds nothing to the result; duplicates are removed so that no location
  # gets repeated.
  accepted <- sentence.entities$occurrences %in% entity.number
  chosen <- unique(sentence.entities[accepted, keys, drop = FALSE])

  merged <- merge(entity.locations, chosen, by = keys)
  merged <- merged[order(merged[[order.column]]), , drop = FALSE]

  # Drop the helper by name instead of by position, so inputs with any
  # number of columns are handled.
  merged[, names(merged) != order.column, drop = FALSE]
}