improve performance

2022-02-20 22:06:42 +03:00 · 2021-06-22 17:08:35 +10:00
parent 0b31918fac
commit aef23ab410
8 changed files with 379 additions and 137 deletions
--- a/app.py
+++ b/app.py
@@ -7,6 +7,7 @@ import dash_uploader as du
 import uuid
 import dash_bootstrap_components as dbc
 from dash.dependencies import Input, Output, State,MATCH, ALL
+from dash.exceptions import PreventUpdate
 import dash_table
 import plotly.graph_objs as go
 import dash_daq as daq
@@ -95,6 +96,7 @@ def build_banner(corpus_element,saves_element,loaded):
                    html.Label("From"),
                    daq.NumericInput(
                        id="year-from",
+                        style={"color":"#e85e56"},
                        className="year-selector",
                        min=min(year_list),
                        max=max(year_list),
@@ -102,6 +104,7 @@ def build_banner(corpus_element,saves_element,loaded):
                    ),html.Label("To"),
                    daq.NumericInput(
                        id="year-to",
+                        style={"color":"#e85e56"},
                        className="year-selector",
                        min=min(year_list),
                        max=max(year_list),
@@ -534,16 +537,43 @@ def build_tabs():
        ],
    )

-def build_tab_1():
+# --- build "sentences" tab ---
+# search, and show ranking
+
+def build_tab_1(corpus_name):
+    corpus_ind_dir=corpus[corpus_name]
+    display_fileds = preprocess_corpus.get_table_fieldnames(corpus_ind_dir)
+    display_fileds.append("score")
+    opt=[{'label': f.upper(), 'value': f} for f in display_fileds]
+    
    return [
        dcc.Store(id='memory-output'),
        dcc.Loading(color="rgba(240, 218, 209,0.8)",type="cube",style={"padding-top":"25%"},children=[
+            html.Div(style={"margin-left": "30px","margin-top": "1.5%", "font-variant": "all-small-caps","color": "#e85e56"},
+                children="QUERY"),
+            html.Div(id="current_query",style={"margin-left": "50px", "color": "#f4e9dc","width": "42%"},
+                children=""),
+            html.Div(style={"display":"flex"},children=[
+                html.Div(style={"width": "430px"},
+                    children=[html.Div(style={"margin-left": "30px", "font-variant": "all-small-caps","color": "#e85e56"},
+                                            children="DISPLAYED COLUMNS"),
+                                    dcc.Dropdown(id="r_table_paras",className="table-dropdown",style={"margin-left":"10%"},
+                                                options=opt,value=display_fileds,multi=True)]
+                        ),
+                html.Div(children=[html.Div(style={"font-variant": "all-small-caps","color": "#e85e56"},
+                                            children="SENTS PER PAGE"),
+                                    daq.NumericInput(id="n_pp",style={"margin-top": "4px","margin-left": "10%","color":"#f4e9dc","width": "40%"},
+                                            className="year-selector sent-pp",min=10,max=1000,value=16)]
+                        )
+                ]),
+            
+            
            html.Div(
-                id="rel_sent",style={"margin-left": "20px", "margin-top": "2%","font-variant": "all-small-caps"},className="side-by-side",
+                id="rel_sent",style={"position": "absolute","right": "1.5%","top": "90px","font-variant": "all-small-caps"},className="side-by-side",
                children=[""]
            ),
            html.Div(
-                id="ranking-table",
+                id="ranking-table",style={"margin-top":"1%"},
                className="output-datatable"
            ),
            ]),
@@ -554,106 +584,151 @@ def build_tab_1():
        )
    ]

-# --- build "sentences" tab ---
-# search, and show ranking

@app.callback([Output("ranking-table", "children"),Output('memory-output', 'data'),
-                Output("rel_sent","children")],
+                Output("rel_sent","children"),Output("current_query","children")],
              [Input("base-term", 'value'),Input("corpus-select-dropdown","value"),
-              Input("added-terms-global", "children"),Input("year-from","value"),Input("year-to","value")],
-              [State("base-term", 'value')]
+              Input("added-terms-global", "children"),Input("year-from","value"),Input("year-to","value"),
+              Input("r_table_paras",'value'),Input("n_pp","value")],
+              [State("base-term", 'value'),
+               State('memory-output', 'data'),State("rel_sent","children"),State("current_query","children")]
              )
-def show_ranking(base_term,corpus_name,added,y_from,y_to,t):
+def show_ranking(base_term,corpus_name,added,y_from,y_to,cols,npp,t, mo_result,rs,cq):
+    """Search the corpus for the base term plus added terms, or only re-lay-out the cached rows when just the column/page-size inputs fired."""
    if corpus_name:
        corpus_ind = corpus[corpus_name]
        if base_term:
            added.append(t) #add base term
            if y_from > y_to:
-                return [html.Div("Please correct the year range."),"",""]
+                return [html.Div("Please correct the year range."),"","",""]
            for term in added: #handle phrases
                if "_" in term:
                    added.append(term.replace("_"," "))
                    added.remove(term)
            
-            result, rel_sent_no, sent_no, rel_article_no, article_no=preprocess_corpus.search_corpus(corpus_ind, added, y_from, y_to)
-            result_df = pd.DataFrame.from_records(result)
-            del result_df["Document"]
-            rel_sent_div=[  html.Div(className="number-card-1",children=[
-                                html.Div(className="number-back",children=[
-                                    html.Div(className="number-dis",children=[rel_sent_no]),
-                                    html.Div(className="number-label",children=[html.Div("RELEVANT SENTENCES")]),
-                                ]),html.Div("|",className="saperator"),
-                                html.Div(className="number-back",children=[
-                                    html.Div(className="number-dis2",children=[sent_no]),
-                                    html.Div(className="number-label2",children=[html.Div("TOTAL SENTENCES")]),  
-                                ])
-                            ]),
-                            html.Div(className="number-card-2",children=[
-                                html.Div(className="number-back",children=[
-                                    html.Div(className="number-dis",children=[rel_article_no]),
-                                    html.Div(className="number-label",children=[html.Div("RELEVANT DOCUMENTS")]),                 
-                                ]),html.Div("|",className="saperator"),
-                                html.Div(className="number-back",children=[
-                                    html.Div(className="number-dis2",children=[article_no]),
-                                    html.Div(className="number-label2",children=[html.Div("TOTAL DOCUMENTS")]),    
-                                ])
-                            ])
-                        ]
-            if len(result)>0:
-                return [dash_table.DataTable(
-                        id="ranking_table",
-                        sort_action='native',
-                        sort_mode='multi',
-                        filter_action="native",
-                        style_header={"fontWeight": "bold", "color": "inherit","border-bottom":"1px dashed"},
-                        style_as_list_view=True,
-                        fill_width=True,
-                        page_size=10,
-                        style_cell_conditional=[
-                            {"if": {"column_id": "Sentence"}, 'width': '500px',"maxWidth":'500px'},
-                            {"if": {"column_id": "Title"}, "padding-left":"15px","maxWidth":'300px'},
-                            {'if': {'column_id': 'Author'},'maxWidth': '150px'},
-                            {'if': {'row_index': 'odd'},"backgroundColor":"#49494966"}
-                        ],
-                        style_cell={
-                            "backgroundColor": "transparent",
-                            "fontFamily": "Open Sans",
-                            "padding": "0 0.2rem",
-                            "color": "#f4e9dc",
-                            "border": "none",
-                            'overflow': 'hidden',
-                            'textOverflow': 'ellipsis',
-                            'width': '55px',
-                            'minWidth': '55px',
-                            'maxWidth': '200px',
-                            "padding-left":"10px",
-                            "textAlign": "left"
-                        },
-                        css=[
-                            {"selector": "tr:hover td", "rule": "color: #e85e56 !important;cursor:pointer;height:10px;"},
-                            {"selector": "td:hover", "rule": "border-bottom: dashed 0px !important;"},
-                            {"selector": ".dash-spreadsheet-container table", 
-                                "rule": '--text-color: #e85e56 !important'},
-                            {"selector":".previous-next-container","rule":"float: left;"},
-                            {"selector": "tr", "rule": "background-color: transparent;"},
-                            {"selector": ".current-page", "rule": "background-color: transparent;"},
-                            {"selector":".current-page::placeholder","rule":"color:#e85e56;"},
-                            {"selector": ".column-header--sort","rule":"color: #e85e56; padding-right:3px;"}
-                        ],
-                        style_data_conditional=[
-                        {"if": {"state": "active"},  # 'active' | 'selected'
-                            "border": "0px solid"}]+
-                        data_bars(result, 'Score'),
-                        data=result,
-                        columns=[{"id": c, "name": c} for c in result_df.columns],
-                        selected_rows=[],
-                    ),result,rel_sent_div]
+            ctx = dash.callback_context
+            if not ctx.triggered:
+                button_id = '' # default
            else:
-                return [html.Div("No result"),"",""]
+                button_id = ctx.triggered[0]['prop_id'].split('.')[0]
+            # button_id: component id of the input that fired ('' if nothing is reported as triggered).
+            # NOTE(review): the phrase loop above removes from `added` while iterating it, so the term following each removed phrase is skipped.
+            if ((button_id == "r_table_paras")|(button_id == "n_pp")): # no need to search index again, just change layout
+                result = mo_result[0] # cached [result, tooltip] pair stored in 'memory-output' by the search branch
+                tooltip = mo_result[1] # NOTE(review): the store holds "" after a no-result/invalid return, so these indexes can raise - confirm
+                if len(result)>0:
+                    result_df = pd.DataFrame.from_records(result)
+                    columns_d = [{"id": "id", "name": "id"}] # "id" and "sentence" are always listed first
+                    columns_d.append({"id": "sentence", "name": "sentence"})
+                    for c in result_df.columns:
+                        if ((c != "id")&(c != "sentence")&(c in cols)):
+                            columns_d.append({"id": c, "name": c}) # remaining columns only if selected in r_table_paras
+
+                    return [build_ranking_table(result,columns_d,tooltip,npp),
+                            mo_result,rs,cq]
+                else:
+                    return [html.Div("No result"),mo_result,rs,cq]
+
+
+            else: # need to search the index
+                # search_corpus returns the rows, the tooltip rows, and the four counts shown in the number cards
+                result, tooltip, rel_sent_no, sent_no, rel_article_no, article_no=preprocess_corpus.search_corpus(corpus_ind, added, y_from, y_to)
+                
+                result_df = pd.DataFrame.from_records(result)
+                del result_df["document"] # NOTE(review): an empty result has no "document" column, so this looks like a KeyError before "No result" - confirm
+                
+
+                rel_sent_div=[  html.Div(className="number-card-1",children=[
+                                    html.Div(className="number-back",children=[
+                                        html.Div(className="number-dis",children=[rel_sent_no]),
+                                        html.Div(className="number-label",children=[html.Div("RELEVANT SENTENCES")]),
+                                    ]),html.Div("|",className="saperator"),
+                                    html.Div(className="number-back",children=[
+                                        html.Div(className="number-dis2",children=[sent_no]),
+                                        html.Div(className="number-label2",children=[html.Div("TOTAL SENTENCES")]),  
+                                    ])
+                                ]),
+                                html.Div(className="number-card-2",children=[
+                                    html.Div(className="number-back",children=[
+                                        html.Div(className="number-dis",children=[rel_article_no]),
+                                        html.Div(className="number-label",children=[html.Div("RELEVANT DOCUMENTS")]),                 
+                                    ]),html.Div("|",className="saperator"),
+                                    html.Div(className="number-back",children=[
+                                        html.Div(className="number-dis2",children=[article_no]),
+                                        html.Div(className="number-label2",children=[html.Div("TOTAL DOCUMENTS")]),    
+                                    ])
+                                ])
+                            ]
+                if len(result)>0:
+                    columns_d = [{"id": "id", "name": "id"}]
+                    columns_d.append({"id": "sentence", "name": "sentence"})
+                    for c in result_df.columns:
+                        if ((c != "id")&(c != "sentence")&(c in cols)):
+                            columns_d.append({"id": c, "name": c})       
+                    
+                    return [build_ranking_table(result,columns_d,tooltip,npp), [result,tooltip], rel_sent_div," | ".join(added)] # cache rows+tooltips for layout-only updates
+                else:
+                    return [html.Div("No result"),"",""," | ".join(added)]
        else:
-            return [html.Div("Please type in the base term in the left pane"),"",""]
+            return [html.Div("Please type in the base term in the left pane"),"","",""]
    else:
-        return [html.Div("Start by selecting a corpus"),"",""]
+        return [html.Div("Start by selecting a corpus"),"","",""]
+
+def build_ranking_table(result,columns_d,tooltip,npp):
+    """Build the ranking DataTable: `result` rows, `columns_d` columns, `tooltip` per-row tooltips, `npp` rows per page."""
+    column_rules=[  # per-column width limits, plus shading on odd rows
+        {"if": {"column_id": "sentence"}, 'width': '500px',"maxWidth":'500px'},
+        {"if": {"column_id": "title"}, "padding-left":"15px","maxWidth":'300px'},
+        {'if': {'column_id': 'author'},'maxWidth': '150px'},
+        {'if': {'row_index': 'odd'},"backgroundColor":"#49494966"},
+    ]
+    cell_style={
+        "backgroundColor": "transparent",
+        "fontFamily": "Open Sans",
+        "padding": "0 0.2rem",
+        "color": "#f4e9dc",
+        "border": "none",
+        'overflow': 'hidden',
+        'textOverflow': 'ellipsis',
+        'width': '55px',
+        'minWidth': '55px',
+        'maxWidth': '200px',
+        "padding-left":"10px",
+        "textAlign": "left",
+    }
+    css_rules=[
+        {"selector": "table", "rule": "--accent: #e85e56;"},
+        {"selector": "tr:hover td", "rule": "color: #e85e56 !important; background-color:transparent !important; cursor:pointer;height:10px;"},
+        {"selector": "td:hover", "rule": "border-bottom: dashed 0px !important;"},
+        {"selector": ".dash-spreadsheet-container table", "rule": '--text-color: #e85e56 !important'},
+        {"selector":".previous-next-container","rule":"float: left;"},
+        {"selector": "tr", "rule": "background-color: transparent;"},
+        {"selector": ".current-page", "rule": "background-color: transparent;"},
+        {"selector":".current-page::placeholder","rule":"color:#e85e56;"},
+        {"selector": ".column-header--sort","rule":"color: #e85e56; padding-right:3px;"},
+    ]
+    # no border on the active cell, followed by the data-bar rules for the 'score' column
+    data_rules=[{"if": {"state": "active"}, "border": "0px solid"}]+data_bars(result, 'score')
+    return dash_table.DataTable(
+        id="ranking_table",
+        sort_action='native',
+        sort_mode='multi',
+        filter_action="native",
+        style_header={"fontWeight": "bold", "color": "inherit","border-bottom":"1px dashed"},
+        style_as_list_view=True,
+        fill_width=True,
+        page_size=npp,
+        style_cell_conditional=column_rules,
+        style_cell=cell_style,
+        css=css_rules,
+        style_data_conditional=data_rules,
+        data=result,
+        columns=columns_d,
+        tooltip_data=tooltip,
+        tooltip_delay=1000, #1s
+        tooltip_duration=None,
+        selected_rows=[],
+    )

 def data_bars(df, column):
    Scores=[]
@@ -702,13 +777,11 @@ def data_bars(df, column):
    [State('memory-output', 'data')])
 def update_graphs(active_cell,data):
    if active_cell:
-        for i in data:
+        for i in data[0]:
            if i['id'] == active_cell['row_id']:
-                return generate_modal(i['Document'])
+                return generate_modal(i['document'])

-    # === Even with no else will give error, but it will close the pop-up automatically ===
-    # else:
-    #     return [[]]
+    raise PreventUpdate 

 # term selection in the sentence pop up window
@app.callback(
@@ -821,6 +894,7 @@ def generate_modal(text=""):
               State("added-terms-global", "children")]
            )
 def add_from_pop(n_clicks,values,added_value_global):
+
    empty_list=[]
    new_add=[]

@@ -845,7 +919,10 @@ def add_from_pop(n_clicks,values,added_value_global):
            if not phrase in added_value_global:
                new_add.append(phrase)

-    return [empty_list,new_add]
+    if n_clicks>0:
+        return [empty_list,new_add]
+    else:
+        raise PreventUpdate


 # function for checking sentence frequency
@@ -1552,11 +1629,11 @@ def build_big_graph(group_sf_dict,doc_num_year,year_from,year_to):
                            ),
                            "autosize":False,
                            "colorway": color_used,#px.colors.qualitative.Prism,
-                            "margin":dict(l=15,r=15,b=10,t=40,pad=4),
+                            "margin":dict(l=55,r=15,b=40,t=40,pad=4),
                            "legend":dict(font=dict(color="#f4e9dc")),
                            "template":"plotly_dark",
                            "width":1100,
-                            "height":290,
+                            "height":330,
                            "legend":dict(yanchor="top",y=0.99,xanchor="left",x=0.01),
                        },
                    }
@@ -1930,7 +2007,7 @@ def update_corpus(data,options):
 )
 def render_tab_content(tab_switch, corpus_name):
    if tab_switch == "tab1":
-        return [build_tab_1()] 
+        return [build_tab_1(corpus_name)] 
    if tab_switch == "tab22":
        return [build_tab_group()]
    if tab_switch == "tab3":
--- a/assets/custom-styles.css
+++ b/assets/custom-styles.css
@@ -27,8 +27,6 @@ label{
 #big-app-container {
  width: 100%;
  display: flex;
-  /*flex-direction: column;*/
-  /*align-items: center;*/
  padding: 0 4rem;
 }

@@ -471,7 +469,7 @@ lable {
 }

 #Overview-tab.custom-tab,
-#Specs-tab.custom-tab, #Graph-tab.custom-tab,
+#Specs-tab.custom-tab, /*#Graph-tab.custom-tab,*/
 #Group-tab.custom-tab, #Control-chart-tab.custom-tab{
  margin-right: 3px;
 }
@@ -866,7 +864,7 @@ lable {

 .output-datatable {
  margin: 1.3rem 0rem 0rem 3rem;
-  line-height:2.4;
+  line-height:1.5;
  align-self: flex-end;
  width: 94%;
  font-variant: all-petite-caps;
@@ -881,12 +879,13 @@ lable {

 .react-numeric-input, .react-numeric-input input {
  width: 100% !important;
+  height: 80%;
 }

 .react-numeric-input input {
-  background-color: #f4e9dc !important;
+  background-color: transparent !important;
  border-color: #e85e56 !important;
-  color: #e85e56 !important;
+  color: inherit !important;
 }

 .react-numeric-input b:nth-child(2) i {
@@ -897,6 +896,18 @@ lable {
  border-color: #e85e56 transparent transparent !important;
 }

+.sent-pp div .react-numeric-input input {
+  border-color: #404040 !important;
+  font-size: small !important;
+}
+
+.sent-pp div .react-numeric-input b:nth-child(2) i {
+  border-color: transparent transparent #404040 !important;
+}
+
+.sent-pp div .react-numeric-input b:nth-child(3) i {
+  border-color: #404040 transparent transparent !important;
+}
 /* Input
 ----------------------------------------------*/

@@ -1056,11 +1067,10 @@ input[type="checkbox"]:checked{
 }
 .side-side-side2{
  display: block;
-  height: 10%;
-  min-height: 145px;
+  height: 25vh;
  width: 90%;
  margin-top: 10px;
-  margin-bottom: 10px;
+  margin-bottom: 16px;
  border: solid #e85e56;
  border-width: 1px;
  border-radius: 4px;
@@ -1126,34 +1136,91 @@ div#added-terms::-webkit-scrollbar-thumb {
    background-color: #e85e56;
    border-radius:10px;
 }
+.dash-tooltip {
+  position: fixed !important;
+  left:40% !important;
+  top:50% !important;
+  border: solid 2px #e85e56 !important;
+  border-radius: 8px;
+  background-color: rgba(31,29,36,0.95) !important;
+}
+.dash-tooltip:before{
+  border: none !important;
+  background-color: transparent !important;
+}
+.dash-tooltip:after{
+  margin-bottom: 2px;
+  border: dashed 0px transparent !important;
+  border-bottom-color: rgba(232, 94, 86,0.8) !important;
+  background-color: transparent !important;
+}

-
+.dash-table-tooltip{
+  max-width: 600px !important;
+  background-color: transparent !important;
+  border: none !important;
+  border-radius: 5px;
+  color:#e85e56;
+  line-height: 25px;
+  font-size: large;
+}

 div#ranking-table::-webkit-scrollbar {
-      width:3px;
-      height: 5px;
+      width:15px;
+      height: 15px;
      background:transparent;
+      margin-left: 3px;
  }
 div#ranking-table::-webkit-scrollbar-track {
-    background:none;
-  }
-
-div#ranking-table:hover ::-webkit-scrollbar-track {
-    background:#f4e9dc;
-    border: 1px solid #1e1c24;
+    background:transparent;
+    border: 0px solid #1e1c24;
    border-radius:10px;
  }

-div#ranking-table:-webkit-scrollbar-thumb {
-    background: none;
+div#ranking-table::-webkit-scrollbar-corner {
+    background: transparent;
 }

-div#ranking-table:hover::-webkit-scrollbar-thumb {
+div#ranking-table::-webkit-scrollbar-thumb {
+    border: 5px solid #1e1c24;
+    background-color: #e85e56;
+    border-radius:10px;
+}
+
+
+
+div#make-groups{
+    overflow-y: scroll;
+    max-height: 25vh;
+    padding-bottom: 15vh;
+}
+div#make-groups::-webkit-scrollbar {
+      width:5px;
+      height: 0px;
+      background:transparent;
+      margin-left: 3px;
+  }
+div#make-groups::-webkit-scrollbar-track {
+    background:transparent;
+    border: 0px solid #1e1c24;
+    border-radius:10px;
+  }
+
+
+div#make-groups::-webkit-scrollbar-thumb {
    border: 0px solid #1e1c24;
    background-color: #e85e56;
    border-radius:10px;
 }

+
+
+
+#ranking_table{
+  padding-right: 10px;
+}
+
+
 div#display-cards::-webkit-scrollbar {
      width:3px;
      height: 5px;
@@ -1420,14 +1487,66 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {

 }

+
+
+
+.table-dropdown > .Select-control > span {
+    
+    background: transparent !important;
+    border-color: #e85e56 !important;
+}
+.table-dropdown div {
+    background-color: #1f1d24 !important;
+    border-color: #404040 !important;
+}
+
+.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value {
+  background-color: #494949;
+  border-color: #494949;
+  color: #f4e9dc;
+
+  border-radius: 5px;
+}
+.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value > .Select-value-label {
+  color: #f4e9dc;
+  font-size: x-small;
+}
+
+.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value > .Select-value-icon {
+  border-right: 0px dashed #e85e56;  
+}
+
+.table-dropdown > .Select-control > .Select-multi-value-wrapper > .Select-value > .Select-value-icon:hover {
+  color: #e85e56; 
+  box-shadow: none;
+}
+
+.table-dropdown{
+  width: 155%;
+}
+
+.table-dropdown > .Select-control {
+  margin-left: 0% !important;
+}
+
+.table-dropdown > .Select-menu-outer div {
+  font-size: small !important;
+}
+
+
+.dash-table-container{
+  width:100%;
+}
 #display-cards{
  overflow-x: scroll;
  display: flex;
  margin-left: 2px;
+  margin-bottom: 10px;
 }
 #ranking-table{
-  overflow-x: scroll;
+  overflow: scroll;
  display: flex;
+  height: 74vh;
 }
 #display-group-graph{
  overflow-x: scroll;
@@ -1451,16 +1570,16 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {

 .Select-menu-outer ::-webkit-scrollbar {
      width:6px;
-      background:#f4e9dc;
+      background:#1e1c24;
 }
 .Select-menu-outer ::-webkit-scrollbar-thumb {
-    border: 1px solid #e85e56;
+    border: 0px solid #e85e56;
    background-color: #e85e56;
    border-radius:10px;
  }
-.Select-menu-outer::-webkit-scrollbar-track {
-    background:#f4e9dc;
-    border: 1px solid #f4e9dc;
+.Select-menu-outer ::-webkit-scrollbar-track {
+    background:#1e1c24;
+    border: 0px solid #f4e9dc;
    border-radius:10px;  
 }

@@ -1610,10 +1729,9 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {
 #app-content {
  background: inherit;
  padding: 0;
-  width: 101%;
  /*max-width: 95%;*/
  height: 97%;
-  overflow-y: scroll;
+  /*overflow-y: scroll;*/
 }

 #settings-menu {
@@ -1656,7 +1774,8 @@ div#search-top-result:hover::-webkit-scrollbar-thumb {
 }

 .figure-side-by-side{
-
+  margin-top: 25px;
+  margin-bottom: 10px;

 }
 .cmSQpo{
--- a/BIN
+++ b/BIN
--- a/topic_model/.DS_Store
+++ b/topic_model/.DS_Store
--- a/topic_model/generate_models_fromapp.py
+++ b/topic_model/generate_models_fromapp.py
@@ -54,7 +54,7 @@ def build_model(df_full,corpus_name,content_col):

    # NLTK Stop words
    from nltk.corpus import stopwords
-    # nltk.download('stopwords')
+    nltk.download('stopwords')
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

--- a/whoosh_search/.DS_Store
+++ b/whoosh_search/.DS_Store
--- a/whoosh_search/preprocess_corpus.py
+++ b/whoosh_search/preprocess_corpus.py
@@ -116,6 +116,7 @@ def delete_corpus_from_app(index_dir):
 def add_new_corpus_from_app(index_dir,corpus_dict,id_col,text_col,title_col,year_col,author_col,add_cols): 

 	doc_no_year={}
+	sent_no_year={}
 	doc_len_dict={}
 	
 	path = os.path.join("./whoosh_search", index_dir)
@@ -169,7 +170,12 @@ def add_new_corpus_from_app(index_dir,corpus_dict,id_col,text_col,title_col,year
 			doc_len_dict[doc_len]+=1
 		else:
 			doc_len_dict[doc_len]=1
-			
+		
+		if year in sent_no_year:
+			sent_no_year[year]+=sen_no
+		else:
+			sent_no_year[year]=sen_no
+
 		if year in doc_no_year:
 			doc_no_year[year]+=1
 		else:
@@ -192,6 +198,12 @@ def add_new_corpus_from_app(index_dir,corpus_dict,id_col,text_col,title_col,year
 		f3.write(text)
 	f3.close()

+	# os.path.join keeps the file inside the index directory, where search_corpus opens "<index dir>/sent_num"
+	with open(os.path.join(path, "sent_num"), "w") as f4:
+		for year in sent_no_year:
+			text=year+" "+str(sent_no_year[year])+"\n"
+			f4.write(text)
+
 	print("[ Indexing Finished. In total "+str(line_no)+" documents. ]")
 	return True

@@ -261,8 +273,12 @@ def filter_corpus(corpus_ind_dir, query_list,year_from, year_to):


 # search by query
-def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the query term in the list will be connected by OR
+def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=1000): #the query term in the list will be connected by OR
 	
+	import time
+
+	start = time.time()
+
 	ix = index.open_dir(corpus_ind_dir) #load index
 	
 	with ix.searcher() as searcher:
@@ -286,11 +302,13 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
 		q2 = query.Or(term_list_Y)

 		q_f = query.And([q1,q2]) 
-
-
+		
+		# search the index
 		results = searcher.search(q_f,limit=None)
 		
+		
 		result_list=[]
+		full_sents =[] 
 		relevant_article_ids=[]
 		i=0

@@ -304,11 +322,11 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
 				row_data = {}
 				
 				row_data["id"] = r["id"]
-				row_data["Year"] = r["year"]
-				row_data["Sentence"] = r["content"].lower()#snipet
-				row_data["Title"] = r["title"].lower()
-				row_data["Author"] = r["author"]
-				row_data["Document"] = r["content"].lower()
+				row_data["year"] = r["year"]
+				row_data["sentence"] = r["content"].lower()#snipet
+				row_data["title"] = r["title"].lower()
+				row_data["author"] = r["author"]
+				row_data["document"] = r["content"].lower()

 				for key in r:
 					if key in ["content", "id", "title", "year", "author"]:
@@ -316,9 +334,12 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the
 					else:
 						row_data[key]=r[key]

-				row_data["Score"] = round(r.score,3)
+				row_data["score"] = round(r.score,3)

 				result_list.append(row_data)
+				full_sents.append({"sentence":row_data["document"]})
+			else:
+				break

 		with open(corpus_ind_dir+"/doc_num") as f:
 			total_doc_no = 0
@@ -332,7 +353,19 @@ def search_corpus(corpus_ind_dir, query_list,year_from, year_to,top_n=100): #the

 		f.close()

-		return [result_list, len(results), len(searcher.search(q2,limit=None)), len(relevant_article_ids),total_doc_no]
+		with open(corpus_ind_dir+"/sent_num") as f:
+			total_sent_no = 0
+			lines = f.readlines()
+
+			for line in lines:
+				sent_num=line.strip().split()
+				if ((int(sent_num[0])>=year_from) & (int(sent_num[0])<=year_to)):
+						total_sent_no+=int(sent_num[1])
+
+		f.close()
+		
+		print("Results returned:", time.time() - start)
+		return [result_list, full_sents, len(results), total_sent_no, len(relevant_article_ids),total_doc_no]

 def check_sf(corpus_ind_dir,query_list):
 	query_l=[]
@@ -580,3 +613,9 @@ def get_fieldnames(corpus_ind_dir):
 	fileds.remove('id')
 	fileds.remove('title')
 	return fileds
+
+def get_table_fieldnames(corpus_ind_dir):  # stored field names of the index at corpus_ind_dir, minus 'content' and 'id'
+	fileds=index.open_dir(corpus_ind_dir).schema.stored_names()
+	fileds.remove('content')  # list.remove raises ValueError if the name is not among the stored fields
+	fileds.remove('id')
+	return fileds
--- a/whoosh_search/sample_data_index/sent_num
+++ b/whoosh_search/sample_data_index/sent_num
@@ -0,0 +1,7 @@
+2012 49
+2013 58
+2014 49
+2015 46
+2016 38
+2017 72
+2018 53