Add table and chart extraction tasks from extract parameters (#288)

This commit is contained in:
Edward Kim
2024-12-16 18:55:55 -08:00
committed by GitHub
parent 2d6075c6ed
commit 98dccebe5e
5 changed files with 9 additions and 35 deletions

View File

@@ -220,8 +220,6 @@ from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.primitives import JobSpec
from nv_ingest_client.primitives.tasks import ExtractTask
from nv_ingest_client.util.file_processing.extract import extract_file_content
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
logger = logging.getLogger("nv_ingest_client")
@@ -254,12 +252,7 @@ extract_task = ExtractTask(
extract_tables=True
)
table_data_extract = TableExtractionTask()
chart_data_extract = ChartExtractionTask()
job_spec.add_task(extract_task)
job_spec.add_task(table_data_extract)
job_spec.add_task(chart_data_extract)
# Create the client and inform it about the JobSpec we want to process.
client = NvIngestClient(

View File

@@ -73,8 +73,6 @@
"from nv_ingest_client.primitives.tasks import DedupTask\n",
"from nv_ingest_client.primitives.tasks import EmbedTask\n",
"from nv_ingest_client.primitives.tasks import ExtractTask\n",
"from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask\n",
"from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask\n",
"from nv_ingest_client.primitives.tasks import FilterTask\n",
"from nv_ingest_client.primitives.tasks import SplitTask\n",
"from nv_ingest_client.primitives.tasks import StoreTask, StoreEmbedTask\n",
@@ -198,17 +196,12 @@
" extract_tables_method=\"yolox\",\n",
")\n",
"\n",
"table_data_extract = TableExtractionTask()\n",
"chart_data_extract = ChartExtractionTask()\n",
"\n",
"dedup_task = DedupTask(\n",
" content_type=\"image\",\n",
" filter=True,\n",
")\n",
"\n",
"job_spec.add_task(extract_task)\n",
"job_spec.add_task(table_data_extract)\n",
"job_spec.add_task(chart_data_extract)\n",
"job_spec.add_task(dedup_task)"
]
},
@@ -589,9 +582,6 @@
" extract_tables_method=\"yolox\",\n",
")\n",
"\n",
"table_data_extract = TableExtractionTask()\n",
"chart_data_extract = ChartExtractionTask()\n",
"\n",
"dedup_task = DedupTask(\n",
" content_type=\"image\",\n",
" filter=True,\n",
@@ -647,8 +637,6 @@
"\n",
"\n",
"job_spec.add_task(extract_task)\n",
"job_spec.add_task(table_data_extract)\n",
"job_spec.add_task(chart_data_extract)\n",
"job_spec.add_task(dedup_task)\n",
"job_spec.add_task(filter_task)\n",
"job_spec.add_task(split_task)\n",
@@ -706,7 +694,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
"version": "3.10.16"
}
},
"nbformat": 4,

View File

@@ -27,8 +27,6 @@ from nv_ingest_client.primitives.tasks import SplitTask
from nv_ingest_client.primitives.tasks import StoreEmbedTask
from nv_ingest_client.primitives.tasks import StoreTask
from nv_ingest_client.primitives.tasks import VdbUploadTask
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
from nv_ingest_client.util.util import filter_function_kwargs
DEFAULT_JOB_QUEUE_ID = "morpheus_task_queue"
@@ -373,11 +371,6 @@ class Ingestor:
)
self._job_specs.add_task(extract_task, document_type=document_type)
if extract_tables is True:
self._job_specs.add_task(TableExtractionTask())
if extract_charts is True:
self._job_specs.add_task(ChartExtractionTask())
return self
@ensure_job_specs

View File

@@ -13,6 +13,9 @@ from typing import Union
from uuid import UUID
from nv_ingest_client.primitives.tasks import Task
from nv_ingest_client.primitives.tasks import ExtractTask
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
from nv_ingest_client.util.dataset import get_dataset_files
from nv_ingest_client.util.dataset import get_dataset_statistics
@@ -162,6 +165,11 @@ class JobSpec:
self._tasks.append(task)
if isinstance(task, ExtractTask) and (task._extract_tables is True):
self._tasks.append(TableExtractionTask())
if isinstance(task, ExtractTask) and (task._extract_charts is True):
self._tasks.append(ChartExtractionTask())
class BatchJobSpec:
"""

View File

@@ -144,10 +144,7 @@ import logging, time
from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.primitives import JobSpec
from nv_ingest_client.primitives.tasks import ExtractTask
from nv_ingest_client.primitives.tasks import SplitTask
from nv_ingest_client.util.file_processing.extract import extract_file_content
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
logger = logging.getLogger("nv_ingest_client")
@@ -180,12 +177,7 @@ extract_task = ExtractTask(
extract_tables=True
)
table_data_extract = TableExtractionTask()
chart_data_extract = ChartExtractionTask()
job_spec.add_task(extract_task)
job_spec.add_task(table_data_extract)
job_spec.add_task(chart_data_extract)
# Create the client and inform it about the JobSpec we want to process.
client = NvIngestClient(