mirror of
https://github.com/NVIDIA/nv-ingest.git
synced 2025-01-05 18:58:13 +03:00
Add table and chart extraction tasks from extract parameters (#288)
This commit is contained in:
@@ -220,8 +220,6 @@ from nv_ingest_client.client import NvIngestClient
|
||||
from nv_ingest_client.primitives import JobSpec
|
||||
from nv_ingest_client.primitives.tasks import ExtractTask
|
||||
from nv_ingest_client.util.file_processing.extract import extract_file_content
|
||||
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
|
||||
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
|
||||
|
||||
logger = logging.getLogger("nv_ingest_client")
|
||||
|
||||
@@ -254,12 +252,7 @@ extract_task = ExtractTask(
|
||||
extract_tables=True
|
||||
)
|
||||
|
||||
table_data_extract = TableExtractionTask()
|
||||
chart_data_extract = ChartExtractionTask()
|
||||
|
||||
job_spec.add_task(extract_task)
|
||||
job_spec.add_task(table_data_extract)
|
||||
job_spec.add_task(chart_data_extract)
|
||||
|
||||
# Create the client and inform it about the JobSpec we want to process.
|
||||
client = NvIngestClient(
|
||||
|
||||
@@ -73,8 +73,6 @@
|
||||
"from nv_ingest_client.primitives.tasks import DedupTask\n",
|
||||
"from nv_ingest_client.primitives.tasks import EmbedTask\n",
|
||||
"from nv_ingest_client.primitives.tasks import ExtractTask\n",
|
||||
"from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask\n",
|
||||
"from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask\n",
|
||||
"from nv_ingest_client.primitives.tasks import FilterTask\n",
|
||||
"from nv_ingest_client.primitives.tasks import SplitTask\n",
|
||||
"from nv_ingest_client.primitives.tasks import StoreTask, StoreEmbedTask\n",
|
||||
@@ -198,17 +196,12 @@
|
||||
" extract_tables_method=\"yolox\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"table_data_extract = TableExtractionTask()\n",
|
||||
"chart_data_extract = ChartExtractionTask()\n",
|
||||
"\n",
|
||||
"dedup_task = DedupTask(\n",
|
||||
" content_type=\"image\",\n",
|
||||
" filter=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"job_spec.add_task(extract_task)\n",
|
||||
"job_spec.add_task(table_data_extract)\n",
|
||||
"job_spec.add_task(chart_data_extract)\n",
|
||||
"job_spec.add_task(dedup_task)"
|
||||
]
|
||||
},
|
||||
@@ -589,9 +582,6 @@
|
||||
" extract_tables_method=\"yolox\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"table_data_extract = TableExtractionTask()\n",
|
||||
"chart_data_extract = ChartExtractionTask()\n",
|
||||
"\n",
|
||||
"dedup_task = DedupTask(\n",
|
||||
" content_type=\"image\",\n",
|
||||
" filter=True,\n",
|
||||
@@ -647,8 +637,6 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"job_spec.add_task(extract_task)\n",
|
||||
"job_spec.add_task(table_data_extract)\n",
|
||||
"job_spec.add_task(chart_data_extract)\n",
|
||||
"job_spec.add_task(dedup_task)\n",
|
||||
"job_spec.add_task(filter_task)\n",
|
||||
"job_spec.add_task(split_task)\n",
|
||||
@@ -706,7 +694,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -27,8 +27,6 @@ from nv_ingest_client.primitives.tasks import SplitTask
|
||||
from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
||||
from nv_ingest_client.primitives.tasks import StoreTask
|
||||
from nv_ingest_client.primitives.tasks import VdbUploadTask
|
||||
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
|
||||
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
|
||||
from nv_ingest_client.util.util import filter_function_kwargs
|
||||
|
||||
DEFAULT_JOB_QUEUE_ID = "morpheus_task_queue"
|
||||
@@ -373,11 +371,6 @@ class Ingestor:
|
||||
)
|
||||
self._job_specs.add_task(extract_task, document_type=document_type)
|
||||
|
||||
if extract_tables is True:
|
||||
self._job_specs.add_task(TableExtractionTask())
|
||||
if extract_charts is True:
|
||||
self._job_specs.add_task(ChartExtractionTask())
|
||||
|
||||
return self
|
||||
|
||||
@ensure_job_specs
|
||||
|
||||
@@ -13,6 +13,9 @@ from typing import Union
|
||||
from uuid import UUID
|
||||
|
||||
from nv_ingest_client.primitives.tasks import Task
|
||||
from nv_ingest_client.primitives.tasks import ExtractTask
|
||||
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
|
||||
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
|
||||
from nv_ingest_client.util.dataset import get_dataset_files
|
||||
from nv_ingest_client.util.dataset import get_dataset_statistics
|
||||
|
||||
@@ -162,6 +165,11 @@ class JobSpec:
|
||||
|
||||
self._tasks.append(task)
|
||||
|
||||
if isinstance(task, ExtractTask) and (task._extract_tables is True):
|
||||
self._tasks.append(TableExtractionTask())
|
||||
if isinstance(task, ExtractTask) and (task._extract_charts is True):
|
||||
self._tasks.append(ChartExtractionTask())
|
||||
|
||||
|
||||
class BatchJobSpec:
|
||||
"""
|
||||
|
||||
@@ -144,10 +144,7 @@ import logging, time
|
||||
from nv_ingest_client.client import NvIngestClient
|
||||
from nv_ingest_client.primitives import JobSpec
|
||||
from nv_ingest_client.primitives.tasks import ExtractTask
|
||||
from nv_ingest_client.primitives.tasks import SplitTask
|
||||
from nv_ingest_client.util.file_processing.extract import extract_file_content
|
||||
from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
|
||||
from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
|
||||
|
||||
logger = logging.getLogger("nv_ingest_client")
|
||||
|
||||
@@ -180,12 +177,7 @@ extract_task = ExtractTask(
|
||||
extract_tables=True
|
||||
)
|
||||
|
||||
table_data_extract = TableExtractionTask()
|
||||
chart_data_extract = ChartExtractionTask()
|
||||
|
||||
job_spec.add_task(extract_task)
|
||||
job_spec.add_task(table_data_extract)
|
||||
job_spec.add_task(chart_data_extract)
|
||||
|
||||
# Create the client and inform it about the JobSpec we want to process.
|
||||
client = NvIngestClient(
|
||||
|
||||
Reference in New Issue
Block a user