This commit is contained in:
bwt09
2022-06-06 23:43:50 -07:00
parent 94c6088c68
commit 1f2654572a
3 changed files with 9 additions and 52 deletions

View File

@@ -135,8 +135,8 @@ class EntityTupleSearcher:
if any([word in stopwords for word in pred_ent.split()]):
return
# filter entity with less than 4 characters
if len(pred_ent.replace(' ', '')) <= 3:
# filter entities with fewer than 3 non-space characters
if len(pred_ent.replace(' ', '')) <= 2:
return
# filter entity with single-character words

View File

@@ -72,11 +72,8 @@ class KnowledgeHarvester:
self._weighted_prompts,
key=lambda t: t[1], reverse=True)[:self._max_n_prompts]
for prompt, weight in self._weighted_prompts:
print(f'{weight:.4f}', prompt)
norm_weights = softmax([weight for _, weight in self._weighted_prompts])
norm_weights[norm_weights < 0.02] = 0.
norm_weights[norm_weights < 0.05] = 0.
norm_weights /= norm_weights.sum()
for i, norm_weight in enumerate(norm_weights):

View File

@@ -339,45 +339,6 @@
"<ENT0> is a dish made from <ENT1> that has been fried in a wok or a pan ."
]
},
"release_time": {
"init_prompts": [
"<ENT0> is released in <ENT1> ."
],
"seed_ent_tuples": [
[
"iPhone",
"2007"
],
[
"Windows 7",
"2009"
],
[
"Volkswagen Golf",
"1974"
],
[
"word2vec",
"2013"
],
[
"Avatar",
"2009"
]
],
"prompts": [
"<ENT0> was released in <ENT1> .",
"In <ENT1>, <ENT0> was released .",
"Google released its <ENT0> algorithm in <ENT1> .",
"Google's <ENT0> algorithm is released in <ENT1> .",
"<ENT0> is a vector representation of words that was released in <ENT1> .",
"<ENT0> is a tool that was released in <ENT1> that creates word embeddings .",
"<ENT0> is a tool that creates word embeddings, that was released in <ENT1> .",
"In <ENT1>, <ENT0> was released as a toolkit for vector representation of words .",
"The <ENT0> algorithm is a neural network algorithm that was released in <ENT1> .",
"<ENT0> is a toolkit that was released in <ENT1> that creates word embeddings, which are vector representations of words that can be used ."
]
},
"processed_from": {
"init_prompts": [
"<ENT0> is the source of <ENT1> ."
@@ -540,7 +501,6 @@
"<ENT0> is an international <ENT1> and gas company .",
"<ENT0> is a <ENT1> company that manufactures and sells vehicles .",
"<ENT0> sells <ENT1>s through its online store and retail locations .",
"<ENT0> is a company that sells graphics processing units (<ENT1>s) .",
"<ENT0> is a popular fast food chain that is known for selling <ENT1>s .",
"<ENT0> is ajapanese<ENT1> manufacturing company that is headquartered in yokohama, japan .",
"<ENT0> is a fast food company that specializes in <ENT1>s, fried chicken, and soft drinks .",
@@ -549,7 +509,7 @@
"<ENT0> describes themselves as \"the world\u2019s largest information technology company by revenue,\" and they sell many products, including <ENT1>s ."
]
},
"featured thing": {
"featured_thing": {
"init_prompts": [
"<ENT0> is a very <ENT1> <ENT2> ."
],
@@ -593,7 +553,7 @@
"Although <ENT0> is not the world's <ENT1>est <ENT2>, it is still very wealthy ."
]
},
"need sth to do sth": {
"need_sth_to_do_sth": {
"init_prompts": [
"<ENT0> needs <ENT1> to <ENT2> ."
],
@@ -637,7 +597,7 @@
"It is important for <ENT0> to be able to <ENT2> with each other through <ENT1> ."
]
},
"more than": {
"more_than": {
"init_prompts": [
"<ENT0> is more <ENT1> than <ENT2> ."
],
@@ -681,7 +641,7 @@
"Some people believe that <ENT0> is more <ENT1> than <ENT2> because it is more active and engaging ."
]
},
"can but not good": {
"can_but_not_good": {
"init_prompts": [
"<ENT0> can <ENT1> but not good at ."
],
@@ -722,7 +682,7 @@
"While <ENT0>s are able to <ENT1>, they are not instinctively good at it and may need some help or encouragement to do so ."
]
},
"worth celebrating": {
"worth_celebrating": {
"init_prompts": [
"It\u2019s worth celebrating for a <ENT0> to <ENT1> ."
],
@@ -761,7 +721,7 @@
"It is seen as significant accomplishment for <ENT0> when they <ENT1> ."
]
},
"potential risk": {
"potential_risk": {
"init_prompts": [
"A potential risk of <ENT0> is <ENT1> ."
],