improve performance

This commit is contained in:
Alicia Yang
2021-06-22 17:08:35 +10:00
parent 0b31918fac
commit aef23ab410
8 changed files with 379 additions and 137 deletions

267
app.py
View File

@@ -7,6 +7,7 @@ import dash_uploader as du
import uuid
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State,MATCH, ALL
from dash.exceptions import PreventUpdate
import dash_table
import plotly.graph_objs as go
import dash_daq as daq
@@ -95,6 +96,7 @@ def build_banner(corpus_element,saves_element,loaded):
html.Label("From"),
daq.NumericInput(
id="year-from",
style={"color":"#e85e56"},
className="year-selector",
min=min(year_list),
max=max(year_list),
@@ -102,6 +104,7 @@ def build_banner(corpus_element,saves_element,loaded):
),html.Label("To"),
daq.NumericInput(
id="year-to",
style={"color":"#e85e56"},
className="year-selector",
min=min(year_list),
max=max(year_list),
@@ -534,16 +537,43 @@ def build_tabs():
],
)
def build_tab_1():
# --- build "sentences" tab ---
# search, and show ranking
def build_tab_1(corpus_name):
corpus_ind_dir=corpus[corpus_name]
display_fileds = preprocess_corpus.get_table_fieldnames(corpus_ind_dir)
display_fileds.append("score")
opt=[{'label': f.upper(), 'value': f} for f in display_fileds]
return [
dcc.Store(id='memory-output'),
dcc.Loading(color="rgba(240, 218, 209,0.8)",type="cube",style={"padding-top":"25%"},children=[
html.Div(style={"margin-left": "30px","margin-top": "1.5%", "font-variant": "all-small-caps","color": "#e85e56"},
children="QUERY"),
html.Div(id="current_query",style={"margin-left": "50px", "color": "#f4e9dc","width": "42%"},
children=""),
html.Div(style={"display":"flex"},children=[
html.Div(style={"width": "430px"},
children=[html.Div(style={"margin-left": "30px", "font-variant": "all-small-caps","color": "#e85e56"},
children="DISPLAYED COLUMNS"),
dcc.Dropdown(id="r_table_paras",className="table-dropdown",style={"margin-left":"10%"},
options=opt,value=display_fileds,multi=True)]
),
html.Div(children=[html.Div(style={"font-variant": "all-small-caps","color": "#e85e56"},
children="SENTS PER PAGE"),
daq.NumericInput(id="n_pp",style={"margin-top": "4px","margin-left": "10%","color":"#f4e9dc","width": "40%"},
className="year-selector sent-pp",min=10,max=1000,value=16)]
)
]),
html.Div(
id="rel_sent",style={"margin-left": "20px", "margin-top": "2%","font-variant": "all-small-caps"},className="side-by-side",
id="rel_sent",style={"position": "absolute","right": "1.5%","top": "90px","font-variant": "all-small-caps"},className="side-by-side",
children=[""]
),
html.Div(
id="ranking-table",
id="ranking-table",style={"margin-top":"1%"},
className="output-datatable"
),
]),
@@ -554,106 +584,151 @@ def build_tab_1():
)
]
# --- build "sentences" tab ---
# search, and show ranking
def _ranking_columns(result, cols):
    """Return the DataTable column specs for the search records in *result*.

    "id" and "sentence" always come first; every other record key is shown
    only when the user selected it in *cols*. The full "document" text is
    never offered as a column.
    """
    selected = set(cols or [])
    result_df = pd.DataFrame.from_records(result)
    columns_d = [{"id": "id", "name": "id"},
                 {"id": "sentence", "name": "sentence"}]
    columns_d.extend(
        {"id": c, "name": c}
        for c in result_df.columns
        if c not in ("id", "sentence", "document") and c in selected
    )
    return columns_d


def _number_card(card_class, relevant, total, noun):
    """Build one "relevant | total" counter card for sentences or documents."""
    return html.Div(className=card_class, children=[
        html.Div(className="number-back", children=[
            html.Div(className="number-dis", children=[relevant]),
            html.Div(className="number-label", children=[html.Div("RELEVANT " + noun)]),
        ]), html.Div("|", className="saperator"),
        html.Div(className="number-back", children=[
            html.Div(className="number-dis2", children=[total]),
            html.Div(className="number-label2", children=[html.Div("TOTAL " + noun)]),
        ])
    ])


@app.callback([Output("ranking-table", "children"), Output('memory-output', 'data'),
               Output("rel_sent", "children"), Output("current_query", "children")],
              [Input("base-term", 'value'), Input("corpus-select-dropdown", "value"),
               Input("added-terms-global", "children"), Input("year-from", "value"), Input("year-to", "value"),
               Input("r_table_paras", 'value'), Input("n_pp", "value")],
              [State("base-term", 'value'),
               State('memory-output', 'data'), State("rel_sent", "children"), State("current_query", "children")]
              )
def show_ranking(base_term,corpus_name,added,y_from,y_to,cols,npp,t, mo_result,rs,cq):
    """Search the selected corpus and render the ranking table.

    Returns a 4-item list matching the callback outputs: the table (or a
    message Div), the data cached in ``memory-output`` (``[result, tooltip]``),
    the counter cards for ``rel_sent`` and the text for ``current_query``.

    When only the displayed columns or the page size changed, the cached
    search result is re-rendered instead of querying the index again.
    """
    if not corpus_name:
        return [html.Div("Start by selecting a corpus"),"","",""]
    if not base_term:
        return [html.Div("Please type in the base term in the left pane"),"","",""]
    # A cleared numeric input yields None, which cannot be compared.
    if y_from is None or y_to is None or y_from > y_to:
        return [html.Div("Please correct the year range."),"","",""]

    # Layout-only refresh: every triggering input must be a layout control,
    # otherwise a simultaneous query change would be ignored.
    ctx = dash.callback_context
    triggered_ids = {item['prop_id'].split('.')[0] for item in (ctx.triggered or [])}
    layout_only = bool(triggered_ids) and triggered_ids <= {"r_table_paras", "n_pp"}

    # Reuse the cache only if a previous search actually stored one.
    if layout_only and mo_result:
        result, tooltip = mo_result[0], mo_result[1]
        if result:
            return [build_ranking_table(result, _ranking_columns(result, cols), tooltip, npp),
                    mo_result, rs, cq]
        return [html.Div("No result"), mo_result, rs, cq]

    # Build a fresh term list (base term included) instead of mutating the
    # component's children while iterating; "_" marks a multi-word phrase.
    terms = [term.replace("_", " ") for term in [*(added or []), t]]
    query_text = " | ".join(terms)

    corpus_ind = corpus[corpus_name]
    result, tooltip, rel_sent_no, sent_no, rel_article_no, article_no = preprocess_corpus.search_corpus(
        corpus_ind, terms, y_from, y_to)
    if not result:
        return [html.Div("No result"), "", "", query_text]

    rel_sent_div = [
        _number_card("number-card-1", rel_sent_no, sent_no, "SENTENCES"),
        _number_card("number-card-2", rel_article_no, article_no, "DOCUMENTS"),
    ]
    return [build_ranking_table(result, _ranking_columns(result, cols), tooltip, npp),
            [result, tooltip], rel_sent_div, query_text]
def build_ranking_table(result,columns_d,tooltip,npp):
    """Create the sortable, filterable DataTable that shows search results.

    result    -- list of row dicts used as the table data
    columns_d -- column specs ({"id": ..., "name": ...}) to display
    tooltip   -- per-row tooltip data passed to ``tooltip_data``
    npp       -- number of rows per page
    """
    header_style = {"fontWeight": "bold", "color": "inherit","border-bottom":"1px dashed"}

    # Column-specific sizing plus zebra striping on odd rows.
    per_column_styles = [
        {"if": {"column_id": "sentence"}, 'width': '500px',"maxWidth":'500px'},
        {"if": {"column_id": "title"}, "padding-left":"15px","maxWidth":'300px'},
        {'if': {'column_id': 'author'},'maxWidth': '150px'},
        {'if': {'row_index': 'odd'},"backgroundColor":"#49494966"},
    ]

    cell_style = {
        "backgroundColor": "transparent",
        "fontFamily": "Open Sans",
        "padding": "0 0.2rem",
        "color": "#f4e9dc",
        "border": "none",
        'overflow': 'hidden',
        'textOverflow': 'ellipsis',
        'width': '55px',
        'minWidth': '55px',
        'maxWidth': '200px',
        "padding-left":"10px",
        "textAlign": "left",
    }

    # Theme overrides injected into the table's own stylesheet.
    # Disabled rule kept for reference:
    # {"selector": ".dash-cell.focused","rule": "background-color: #f4e9dc !important; border:none;"}
    css_rules = [
        {"selector": "table", "rule": "--accent: #e85e56;"},
        {"selector": "tr:hover td", "rule": "color: #e85e56 !important; background-color:transparent !important; cursor:pointer;height:10px;"},
        {"selector": "td:hover", "rule": "border-bottom: dashed 0px !important;"},
        {"selector": ".dash-spreadsheet-container table",
         "rule": '--text-color: #e85e56 !important'},
        {"selector":".previous-next-container","rule":"float: left;"},
        {"selector": "tr", "rule": "background-color: transparent;"},
        {"selector": ".current-page", "rule": "background-color: transparent;"},
        {"selector":".current-page::placeholder","rule":"color:#e85e56;"},
        {"selector": ".column-header--sort","rule":"color: #e85e56; padding-right:3px;"},
    ]

    # Borderless active cell ('active' | 'selected'), followed by the score bars.
    active_cell_style = {"if": {"state": "active"}, "border": "0px solid"}
    data_styles = [active_cell_style] + data_bars(result, 'score')

    return dash_table.DataTable(
        id="ranking_table",
        sort_action='native',
        sort_mode='multi',
        filter_action="native",
        style_header=header_style,
        style_as_list_view=True,
        fill_width=True,
        page_size=npp,
        style_cell_conditional=per_column_styles,
        style_cell=cell_style,
        css=css_rules,
        style_data_conditional=data_styles,
        data=result,
        columns=columns_d,
        tooltip_data=tooltip,
        tooltip_delay=1000,  # 1s
        tooltip_duration=None,
        selected_rows=[],
    )
def data_bars(df, column):
Scores=[]
@@ -702,13 +777,11 @@ def data_bars(df, column):
[State('memory-output', 'data')])
def update_graphs(active_cell,data):
if active_cell:
for i in data:
for i in data[0]:
if i['id'] == active_cell['row_id']:
return generate_modal(i['Document'])
return generate_modal(i['document'])
# === Even with no else will give error, but it will close the pop-up automatically ===
# else:
# return [[]]
raise PreventUpdate
# term selection in the sentence pop up window
@app.callback(
@@ -821,6 +894,7 @@ def generate_modal(text=""):
State("added-terms-global", "children")]
)
def add_from_pop(n_clicks,values,added_value_global):
empty_list=[]
new_add=[]
@@ -845,7 +919,10 @@ def add_from_pop(n_clicks,values,added_value_global):
if not phrase in added_value_global:
new_add.append(phrase)
return [empty_list,new_add]
if n_clicks>0:
return [empty_list,new_add]
else:
raise PreventUpdate
# function for checking sentence frequency
@@ -1552,11 +1629,11 @@ def build_big_graph(group_sf_dict,doc_num_year,year_from,year_to):
),
"autosize":False,
"colorway": color_used,#px.colors.qualitative.Prism,
"margin":dict(l=15,r=15,b=10,t=40,pad=4),
"margin":dict(l=55,r=15,b=40,t=40,pad=4),
"legend":dict(font=dict(color="#f4e9dc")),
"template":"plotly_dark",
"width":1100,
"height":290,
"height":330,
"legend":dict(yanchor="top",y=0.99,xanchor="left",x=0.01),
},
}
@@ -1930,7 +2007,7 @@ def update_corpus(data,options):
)
def render_tab_content(tab_switch, corpus_name):
if tab_switch == "tab1":
return [build_tab_1()]
return [build_tab_1(corpus_name)]
if tab_switch == "tab22":
return [build_tab_group()]
if tab_switch == "tab3":

View File

@@ -27,8 +27,6 @@ label{
#big-app-container {
width: 100%;
display: flex;
/*flex-direction: column;*/
/*align-items: center;*/
padding: 0 4rem;
}
@@ -471,7 +469,7 @@ lable {
}
#Overview-tab.custom-tab,
#Specs-tab.custom-tab, #Graph-tab.custom-tab,
#Specs-tab.custom-tab, /*#Graph-tab.custom-tab,*/
#Group-tab.custom-tab, #Control-chart-tab.custom-tab{
margin-right: 3px;
}
@@ -866,7 +864,7 @@ lable {
.output-datatable {
margin: 1.3rem 0rem 0rem 3rem;
line-height:2.4;
line-height:1.5;
align-self: flex-end;
width: 94%;
font-variant: all-petite-caps;
@@ -881,12 +879,13 @@ lable {
.react-numeric-input, .react-numeric-input input {
width: 100% !important;
height: 80%;
}
.react-numeric-input input {
background-color: #f4e9dc !important;
background-color: transparent !important;
border-color: #e85e56 !important;
color: #e85e56 !important;
color: inherit !important;
}
.react-numeric-input b:nth-child(2) i {
@@ -897,6 +896,18 @@ lable {
border-color: #e85e56 transparent transparent !important;
}
.sent-pp div .react-numeric-input input {
border-color: #404040 !important;
font-size: small !important;
}
.sent-pp div .react-numeric-input b:nth-child(2) i {
border-color: transparent transparent #404040 !important;
}
.sent-pp div .react-numeric-input b:nth-child(3) i {
border-color: #404040 transparent transparent !important;
}
/* Input
----------------------------------------------*/
@@ -1056,11 +1067,10 @@ input[type="checkbox"]:checked{
}
.side-side-side2{
display: block;
height: 10%;
min-height: 145px;
height: 25vh;
width: 90%;
margin-top: 10px;
margin-bottom: 10px;
margin-bottom: 16px;
border: solid #e85e56;
border-width: 1px;
border-radius: 4px;
@@ -1126,34 +1136,91 @@ div#added-terms::-webkit-scrollbar-thumb {
background-color: #e85e56;
border-radius:10px;
}
.dash-tooltip {
position: fixed !important;
left:40% !important;
top:50% !important;
border: solid 2px #e85e56 !important;
border-radius: 8px;
background-color: rgba(31,29,36,0.95) !important;
}
.dash-tooltip:before{
border: none !important;
background-color: transparent !important;
}
.dash-tooltip:after{
margin-bottom: 2px;
border: dashed 0px transparent !important;
border-bottom-color: rgba(232, 94, 86,0.8) !important;
background-color: transparent !important;
}
.dash-table-tooltip{
max-width: 600px !important;
background-color: transparent !important;
border: none !important;
border-radius: 5px;
color:#e85e56;
line-height: 25px;
font-size: large;
}
div#ranking-table::-webkit-scrollbar {
width:3px;
height: 5px;
width:15px;
height: 15px;
background:transparent;
margin-left: 3px;
}
div#ranking-table::-webkit-scrollbar-track {
background:none;
}
div#ranking-table:hover ::-webkit-scrollbar-track {
background:#f4e9dc;
border: 1px solid #1e1c24;
background:transparent;
border: 0px solid #1e1c24;
border-radius:10px;
}
div#ranking-table:-webkit-scrollbar-thumb {
background: none;
div#ranking-table::-webkit-scrollbar-corner {
background: transparent;
}
div#ranking-table:hover::-webkit-scrollbar-thumb {
div#ranking-table::-webkit-scrollbar-thumb {
border: 5px solid #1e1c24;
background-color: #e85e56;
border-radius:10px;
}
div#make-groups{
overflow-y: scroll;
max-height: 25vh;
padding-bottom: 15vh;
}
div#make-groups::-webkit-scrollbar {
width:5px;
height: 0px;
background:transparent;
margin-left: 3px;
}
div#make-groups::-webkit-scrollbar-track {
background:transparent;
border: 0px solid #1e1c24;
border-radius:10px;
}
div#make-groups::-webkit-scrollbar-thumb {
border: 0px solid #1e1c24;
background-color: #e85e56;
border-radius:10px;
}
#ranking_table{
padding-right: 10px;
}
div#display-cards::-webkit-scrollbar {
width:3px;
height: 5px;
@@ -1420,14 +1487,66 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {
}
.table-dropdown > .Select-control > span {
background: transparent !important;
border-color: #e85e56 !important;
}
.table-dropdown div {
background-color: #1f1d24 !important;
border-color: #404040 !important;
}
.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value {
background-color: #494949;
border-color: #494949;
color: #f4e9dc;
border-radius: 5px;
}
.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value > .Select-value-label {
color: #f4e9dc;
font-size: x-small;
}
.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value > .Select-value-icon {
border-right: 0px dashed #e85e56;
}
.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value > .Select-value-icon:hover {
color: #e85e56;
box-shadow: none;
}
.table-dropdown{
width: 155%;
}
.table-dropdown > .Select-control {
margin-left: 0% !important;
}
.table-dropdown > .Select-menu-outer div {
font-size: small !important;
}
.dash-table-container{
width:100%;
}
#display-cards{
overflow-x: scroll;
display: flex;
margin-left: 2px;
margin-bottom: 10px;
}
#ranking-table{
overflow-x: scroll;
overflow: scroll;
display: flex;
height: 74vh;
}
#display-group-graph{
overflow-x: scroll;
@@ -1451,16 +1570,16 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {
.Select-menu-outer ::-webkit-scrollbar {
width:6px;
background:#f4e9dc;
background:#1e1c24;
}
.Select-menu-outer ::-webkit-scrollbar-thumb {
border: 1px solid #e85e56;
border: 0px solid #e85e56;
background-color: #e85e56;
border-radius:10px;
}
.Select-menu-outer::-webkit-scrollbar-track {
background:#f4e9dc;
border: 1px solid #f4e9dc;
.Select-menu-outer ::-webkit-scrollbar-track {
background:#1e1c24;
border: 0px solid #f4e9dc;
border-radius:10px;
}
@@ -1610,10 +1729,9 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {
#app-content {
background: inherit;
padding: 0;
width: 101%;
/*max-width: 95%;*/
height: 97%;
overflow-y: scroll;
/*overflow-y: scroll;*/
}
#settings-menu {
@@ -1656,7 +1774,8 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {
}
.figure-side-by-side{
margin-top: 25px;
margin-bottom: 10px;
}
.cmSQpo{

Binary file not shown.

BIN
topic_model/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -54,7 +54,7 @@ def build_model(df_full,corpus_name,content_col):
# NLTK Stop words
from nltk.corpus import stopwords
# nltk.download('stopwords')
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

BIN
whoosh_search/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -116,6 +116,7 @@ def delete_corpus_from_app(index_dir):
def add_new_corpus_from_app(index_dir,corpus_dict,id_col,text_col,title_col,year_col,author_col,add_cols):
doc_no_year={}
sent_no_year={}
doc_len_dict={}
path = os.path.join("./whoosh_search", index_dir)
@@ -169,7 +170,12 @@ def add_new_corpus_from_app(index_dir,corpus_dict,id_col,text_col,title_col,year
doc_len_dict[doc_len]+=1
else:
doc_len_dict[doc_len]=1
if year in sent_no_year:
sent_no_year[year]+=sen_no
else:
sent_no_year[year]=sen_no
if year in doc_no_year:
doc_no_year[year]+=1
else:
@@ -192,6 +198,12 @@ def add_new_corpus_from_app(index_dir,corpus_dict,id_col,text_col,title_col,year
f3.write(text)
f3.close()
f4 = open(path+"sent_num", "w")
for year in sent_no_year:
text=year+" "+str(sent_no_year[year])+"\n"
f4.write(text)
f4.close()
print("[ Indexing Finished. In total "+str(line_no)+" documents. ]")
return True
@@ -261,8 +273,12 @@ def filter_corpus(corpus_ind_dir, query_list,year_from, year_to):
# search by query
def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the query term in the list will be connected by OR
def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=1000): #the query term in the list will be connected by OR
import time
start = time.time()
ix = index.open_dir(corpus_ind_dir) #load index
with ix.searcher() as searcher:
@@ -286,11 +302,13 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
q2 = query.Or(term_list_Y)
q_f = query.And([q1,q2])
# search the index
results = searcher.search(q_f,limit=None)
result_list=[]
full_sents =[]
relevant_article_ids=[]
i=0
@@ -304,11 +322,11 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
row_data = {}
row_data["id"] = r["id"]
row_data["Year"] = r["year"]
row_data["Sentence"] = r["content"].lower()#snipet
row_data["Title"] = r["title"].lower()
row_data["Author"] = r["author"]
row_data["Document"] = r["content"].lower()
row_data["year"] = r["year"]
row_data["sentence"] = r["content"].lower()#snipet
row_data["title"] = r["title"].lower()
row_data["author"] = r["author"]
row_data["document"] = r["content"].lower()
for key in r:
if key in ["content", "id", "title", "year", "author"]:
@@ -316,9 +334,12 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
else:
row_data[key]=r[key]
row_data["Score"] = round(r.score,3)
row_data["score"] = round(r.score,3)
result_list.append(row_data)
full_sents.append({"sentence":row_data["document"]})
else:
break
with open(corpus_ind_dir+"/doc_num") as f:
total_doc_no = 0
@@ -332,7 +353,19 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
f.close()
return [result_list, len(results), len(searcher.search(q2,limit=None)), len(relevant_article_ids),total_doc_no]
with open(corpus_ind_dir+"/sent_num") as f:
total_sent_no = 0
lines = f.readlines()
for line in lines:
sent_num=line.strip().split()
if ((int(sent_num[0])>=year_from) & (int(sent_num[0])<=year_to)):
total_sent_no+=int(sent_num[1])
f.close()
print("Results returned:", time.time() - start)
return [result_list, full_sents, len(results), total_sent_no, len(relevant_article_ids),total_doc_no]
def check_sf(corpus_ind_dir,query_list):
query_l=[]
@@ -580,3 +613,9 @@ def get_fieldnames(corpus_ind_dir):
fileds.remove('id')
fileds.remove('title')
return fileds
def get_table_fieldnames(corpus_ind_dir):
    """Return the stored field names of the index in *corpus_ind_dir* that can be table columns.

    The 'content' and 'id' fields are dropped from the schema's stored names;
    ``list.remove`` raises ``ValueError`` if either one is absent.
    """
    schema = index.open_dir(corpus_ind_dir).schema
    table_fields = schema.stored_names()
    for hidden in ('content', 'id'):
        table_fields.remove(hidden)
    return table_fields

View File

@@ -0,0 +1,7 @@
2012 49
2013 58
2014 49
2015 46
2016 38
2017 72
2018 53