Mirror of https://github.com/NVIDIA/nv-ingest.git (synced 2025-01-05 18:58:13 +03:00)
Initial commit
75
.github/ISSUE_TEMPLATE/bug_report_form.yml
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Bug Report
|
||||
description: File a bug report for NV-Ingest
|
||||
title: "[BUG]: "
|
||||
labels: ["bug"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/nv-ingest/blob/main/CODE_OF_CONDUCT.md)
|
||||
You have also searched the [existing open bugs](https://github.com/NVIDIA/nv-ingest/issues?q=is%3Aopen+is%3Aissue+label%3Abug)
|
||||
|
||||
|
||||
- type: input
|
||||
id: version
|
||||
attributes:
|
||||
label: Version
|
||||
description: What version of NVIDIA Ingest are you running?
|
||||
placeholder: "example: 24.08"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: installation-method
|
||||
attributes:
|
||||
label: Which installation method(s) does this occur on?
|
||||
multiple: true
|
||||
options:
|
||||
- Docker
|
||||
- Source
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Describe the bug.
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: XYZ occurred, I expected QRS results
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: mvr
|
||||
attributes:
|
||||
label: Minimum reproducible example
|
||||
description: Please supply a [minimum reproducible code example](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) here
|
||||
render: shell
|
||||
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please paste relevant error and log output here
|
||||
render: shell
|
||||
|
||||
- type: textarea
|
||||
id: misc
|
||||
attributes:
|
||||
label: Other/Misc.
|
||||
description: Please enter any other helpful information. Consider running `print_env.sh` and pasting the results here.
|
||||
11
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
# GitHub info on config.yml
|
||||
# https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository#configuring-the-template-chooser
|
||||
# Set to 'false' if you only want the templates to be used.
|
||||
blank_issues_enabled: true
|
||||
|
||||
# When using discussions instead of Question issue templates,
|
||||
# link that below to have it show up in the 'Submit Issue' page
|
||||
contact_links:
|
||||
- name: Ask a Question
|
||||
url: https://github.com/nvidia/nv-ingest/discussions
|
||||
about: Please ask any questions here.
|
||||
58
.github/ISSUE_TEMPLATE/documentation_request_correction.yml
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Documentation - Correction/Update Request
|
||||
description: Request corrections or updates to existing NV-Ingest documentation
|
||||
title: "[DOC]: "
|
||||
labels: ["doc"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to improve our documentation!
|
||||
By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/nv-ingest/blob/main/CODE_OF_CONDUCT.md)
|
||||
You have also searched the [existing open doc issues](https://github.com/NVIDIA/nv-ingest/issues?q=is%3Aopen+is%3Aissue+label%3Adoc)
|
||||
|
||||
- type: dropdown
|
||||
id: criticality
|
||||
attributes:
|
||||
label: How would you describe the priority of this documentation request?
|
||||
options:
|
||||
- Currently preventing usage
|
||||
- Significant improvement
|
||||
- Would be nice
|
||||
|
||||
- type: input
|
||||
id: correction_location
|
||||
attributes:
|
||||
label: Please provide a link or source to the relevant docs
|
||||
placeholder: "ex: https://github.com/NVIDIA/nv-ingest/blob/main/README.md"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: Describe the problems in the documentation
|
||||
placeholder: The documents say to use foo.func(args) however an AttributeError is thrown
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: correction
|
||||
attributes:
|
||||
label: (Optional) Propose a correction or improvement
|
||||
placeholder: foo.func() was deprecated, replace documentation with foo.new_func()
|
||||
53
.github/ISSUE_TEMPLATE/documentation_request_new.yml
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Documentation - New Documentation Request
|
||||
description: Request additions to NV-Ingest documentation
|
||||
title: "[DOC]: "
|
||||
labels: ["doc"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to improve our documentation!
|
||||
By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/nv-ingest/blob/main/CODE_OF_CONDUCT.md)
|
||||
You have also searched the [existing open doc issues](https://github.com/NVIDIA/nv-ingest/issues?q=is%3Aopen+is%3Aissue+label%3Adoc)
|
||||
|
||||
- type: dropdown
|
||||
id: criticality
|
||||
attributes:
|
||||
label: How would you describe the priority of this documentation request?
|
||||
options:
|
||||
- Currently preventing usage
|
||||
- Significant improvement
|
||||
- Would be nice
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: Describe the future/missing documentation
|
||||
placeholder: A code snippet mentions function foo(args) but I cannot find any documentation on it.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: search_locs
|
||||
attributes:
|
||||
label: Where have you looked?
|
||||
placeholder: |
|
||||
https://github.com/NVIDIA/nv-ingest/blob/main/README.md
|
||||
77
.github/ISSUE_TEMPLATE/feature_request_form.yml
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Feature Request Form
|
||||
description: Request functionality or changes to existing functionality for NV-Ingest
|
||||
title: "[FEA]: "
|
||||
labels: ["feature request"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this feature request!
|
||||
By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/nv-ingest/blob/main/CODE_OF_CONDUCT.md)
|
||||
You have also searched the [existing open feature requests](https://github.com/NVIDIA/nv-ingest/issues?q=is%3Aopen+is%3Aissue+label%3Afeature+request)
|
||||
|
||||
- type: dropdown
|
||||
id: new_or_improvement
|
||||
attributes:
|
||||
label: Is this a new feature, an improvement, or a change to existing functionality?
|
||||
options:
|
||||
- New Feature
|
||||
- Improvement
|
||||
- Change
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: criticality
|
||||
attributes:
|
||||
label: How would you describe the priority of this feature request?
|
||||
options:
|
||||
- Currently preventing usage
|
||||
- Significant improvement
|
||||
- Would be nice
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: Please provide a clear description of the problem this feature solves
|
||||
description: Real, non-code usage examples are especially helpful.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: solution
|
||||
attributes:
|
||||
label: Describe the feature, and optionally a solution or implementation and any alternatives
|
||||
description: Please describe the functionality you would like added.
|
||||
placeholder: >
|
||||
A new function that takes in the information in this form, and triages the issue
|
||||
|
||||
def feature_request(form_info):
|
||||
parse(form_info)
|
||||
return triage_outcome
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: misc
|
||||
attributes:
|
||||
label: Additional context
|
||||
description: Add any other context, code examples, or references to existing implementations about the feature request here. If applicable, please list the modules affected.
|
||||
9
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
## Description
|
||||
<!-- Provide a standalone description of changes in this PR. -->
|
||||
<!-- Reference any issues closed by this PR with "closes #1234". -->
|
||||
<!-- Note: The pull request title will be included in the CHANGELOG. -->
|
||||
|
||||
## Checklist
|
||||
- [ ] I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/nv-ingest/blob/main/CONTRIBUTING.md).
|
||||
- [ ] New or existing tests cover these changes.
|
||||
- [ ] The documentation is up to date with these changes.
|
||||
13
CHANGELOG.md
Normal file
@@ -0,0 +1,13 @@
|
||||
# NVIDIA Ingest 24.08.0
|
||||
|
||||
## New Features
|
||||
|
||||
- ...
|
||||
|
||||
## Improvements
|
||||
|
||||
- ...
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- ...
|
||||
45
CITATION.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Citation Guide
|
||||
|
||||
## To Cite NVIDIA Ingest
|
||||
If you use NVIDIA Ingest in a publication, please use citations in the following format (BibTeX entry for LaTeX):
|
||||
```tex
|
||||
@Manual{,
|
||||
title = {NVIDIA Ingest: An accelerated pipeline for document ingestion},
|
||||
author = {NVIDIA Ingest Development Team},
|
||||
year = {2024},
|
||||
url = {https://github.com/NVIDIA/nv-ingest},
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Sample Citations:
|
||||
|
||||
Using [RAPIDS](https://rapids.ai) citations for reference.
|
||||
|
||||
### Bringing UMAP Closer to the Speed of Light <br> with GPU Acceleration
|
||||
```tex
|
||||
@misc{
|
||||
nolet2020bringing,
|
||||
title={Bringing UMAP Closer to the Speed of Light with GPU Acceleration},
|
||||
author={Corey J. Nolet and Victor Lafargue and Edward Raff and Thejaswi Nanditale and Tim Oates and John Zedlewski and Joshua Patterson},
|
||||
year={2020},
|
||||
eprint={2008.00325},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.LG}
|
||||
}
|
||||
```
|
||||
|
||||
### Machine Learning in Python: <br> Main developments and technology trends in data science, machine learning, and artificial intelligence
|
||||
```tex
|
||||
@article{
|
||||
raschka2020machine,
|
||||
title={Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence},
|
||||
author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey},
|
||||
journal={Information},
|
||||
volume={11},
|
||||
number={4},
|
||||
pages={193},
|
||||
year={2020},
|
||||
publisher={Multidisciplinary Digital Publishing Institute}
|
||||
}
|
||||
```
|
||||
84
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Overview
|
||||
|
||||
Define the code of conduct followed and enforced for NVIDIA Ingest.
|
||||
|
||||
### Intended audience
|
||||
|
||||
Community | Developers | Project Leads
|
||||
|
||||
## Our Pledge
|
||||
|
||||
In the interest of fostering an open and welcoming environment, we as
|
||||
contributors and maintainers pledge to making participation in our project and
|
||||
our community a harassment-free experience for everyone, regardless of age, body
|
||||
size, disability, ethnicity, sex characteristics, gender identity and expression,
|
||||
level of experience, education, socio-economic status, nationality, personal
|
||||
appearance, race, religion, or sexual identity and orientation.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to creating a positive environment
|
||||
include:
|
||||
|
||||
* Using welcoming and inclusive language
|
||||
* Being respectful of differing viewpoints and experiences
|
||||
* Gracefully accepting constructive criticism
|
||||
* Focusing on what is best for the community
|
||||
* Showing empathy towards other community members
|
||||
|
||||
Examples of unacceptable behavior by participants include:
|
||||
|
||||
* The use of sexualized language or imagery and unwelcome sexual attention or
|
||||
advances
|
||||
* Trolling, insulting/derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or electronic
|
||||
address, without explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Our Responsibilities
|
||||
|
||||
Project maintainers are responsible for clarifying the standards of acceptable
|
||||
behavior and are expected to take appropriate and fair corrective action in
|
||||
response to any instances of unacceptable behavior.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct, or to ban temporarily or
|
||||
permanently any contributor for other behaviors that they deem inappropriate,
|
||||
threatening, offensive, or harmful.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies both within project spaces and in public spaces
|
||||
when an individual is representing the project or its community. Examples of
|
||||
representing a project or community include using an official project e-mail
|
||||
address, posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event. Representation of a project may be
|
||||
further defined and clarified by project maintainers.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported by contacting GitHub_Conduct@nvidia.com. All complaints will be reviewed and
|
||||
investigated and will result in a response that is deemed necessary and appropriate
|
||||
to the circumstances. The project team is obligated to maintain confidentiality with
|
||||
regard to the reporter of an incident. Further details of specific enforcement policies
|
||||
may be posted separately.
|
||||
|
||||
Project maintainers who do not follow or enforce the Code of Conduct in good
|
||||
faith may face temporary or permanent repercussions as determined by other
|
||||
members of the project's leadership.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
||||
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
|
||||
For answers to common questions about this code of conduct, see
|
||||
https://www.contributor-covenant.org/faq
|
||||
421
CONTRIBUTING.md
Normal file
@@ -0,0 +1,421 @@
|
||||
# Contributing to NV-Ingest
|
||||
|
||||
External contributions will be welcome soon, and they are greatly appreciated! Every little bit helps, and credit will always be given.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Filing Issues](#filing-issues)
|
||||
2. [Cloning the Repository](#cloning-the-repository)
|
||||
3. [Code Contributions](#code-contributions)
|
||||
- [Your First Issue](#your-first-issue)
|
||||
- [Seasoned Developers](#seasoned-developers)
|
||||
- [Workflow](#workflow)
|
||||
- [Common Processing Patterns](#common-processing-patterns)
|
||||
- [traceable](#traceable)
|
||||
- [nv_ingest_node_failure_context_manager](#nv_ingest_node_failure_context_manager)
|
||||
- [filter_by_task](#filter_by_task)
|
||||
- [cm_skip_processing_if_failed](#cm_skip_processing_if_failed)
|
||||
- [Adding a New Stage or Module](#adding-a-new-stage-or-module)
|
||||
- [Common Practices for Writing Unit Tests](#common-practices-for-writing-unit-tests)
|
||||
- [General Guidelines](#general-guidelines)
|
||||
- [Mocking External Services](#mocking-external-services)
|
||||
- [Submodules, Third Party Libraries, and Models](#submodules-third-party-libraries-and-models)
|
||||
- [Submodules](#submodules)
|
||||
- [Models](#models)
|
||||
4. [Architectural Guidelines](#architectural-guidelines)
|
||||
- [Single Responsibility Principle (SRP)](#1-single-responsibility-principle-srp)
|
||||
- [Interface Segregation Principle (ISP)](#2-interface-segregation-principle-isp)
|
||||
- [Dependency Inversion Principle (DIP)](#3-dependency-inversion-principle-dip)
|
||||
- [Physical Design Structure Mirroring Logical Design Structure](#4-physical-design-structure-mirroring-logical-design-structure)
|
||||
- [Levelization](#5-levelization)
|
||||
- [Acyclic Dependencies Principle (ADP)](#6-acyclic-dependencies-principle-adp)
|
||||
- [Package Cohesion Principles](#7-package-cohesion-principles)
|
||||
- [Common Closure Principle (CCP)](#common-closure-principle-ccp)
|
||||
- [Common Reuse Principle (CRP)](#common-reuse-principle-crp)
|
||||
- [Encapsulate What Varies](#8-encapsulate-what-varies)
|
||||
- [Favor Composition Over Inheritance](#9-favor-composition-over-inheritance)
|
||||
- [Clean Separation of Concerns (SoC)](#10-clean-separation-of-concerns-soc)
|
||||
- [Principle of Least Knowledge (Law of Demeter)](#11-principle-of-least-knowledge-law-of-demeter)
|
||||
- [Document Assumptions and Decisions](#12-document-assumptions-and-decisions)
|
||||
- [Continuous Integration and Testing](#13-continuous-integration-and-testing)
|
||||
5. [Licensing](#licensing)
|
||||
6. [Attribution](#attribution)
|
||||
|
||||
## Filing Issues
|
||||
|
||||
1. **Bug Reports, Feature Requests, and Documentation Issues:** Please file
|
||||
an [issue](https://github.com/NVIDIA/nv-ingest/issues) with a detailed
|
||||
description of
|
||||
the problem, feature request, or documentation issue. The NV-Ingest team will review and triage these issues,
|
||||
scheduling them for a future release.
|
||||
|
||||
## Cloning the Repository
|
||||
|
||||
```bash
|
||||
DATASET_ROOT=[path to your dataset root]
|
||||
MODULE_NAME=[]
|
||||
MORPHEUS_ROOT=[path to your Morpheus root]
|
||||
NV_INGEST_ROOT=[path to your NV-Ingest root]
|
||||
git clone https://github.com/nv-morpheus/Morpheus.git $MORPHEUS_ROOT
|
||||
git clone https://github.com/NVIDIA/nv-ingest.git $NV_INGEST_ROOT
|
||||
cd $NV_INGEST_ROOT
|
||||
```
|
||||
|
||||
Ensure all submodules are checked out:
|
||||
|
||||
```bash
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
## Code Contributions
|
||||
|
||||
### Your First Issue
|
||||
|
||||
1. **Finding an Issue:** Start with issues
|
||||
labeled [good first issue](https://github.com/NVIDIA/nv-ingest/labels/good%20first%20issue).
|
||||
2. **Claim an Issue:** Comment on the issue you wish to work on.
|
||||
3. **Implement Your Solution:** Dive into the code! Update or add unit tests as necessary.
|
||||
4. **Submit Your Pull Request:** [Create a pull request](https://github.com/NVIDIA/nv-ingest/pulls) once your code is ready.
|
||||
5. **Code Review:** Wait for review from other developers and make any necessary updates.
|
||||
6. **Merge:** Once approved, an NV-Ingest developer will merge your pull request.
|
||||
|
||||
### Seasoned Developers
|
||||
|
||||
For those familiar with the codebase, please check
|
||||
the [project boards](https://github.com/orgs/NVIDIA/projects/48/views/1) for
|
||||
issues. Look for unassigned issues and follow the steps starting from **Claim an Issue**.
|
||||
|
||||
### Workflow
|
||||
|
||||
1. **NV-Ingest Foundation**: Built on top
|
||||
of [NVIDIA Morpheus](https://github.com/nv-morpheus/Morpheus/blob/branch-24.10/docs/source/developer_guide/architecture.md).
|
||||
|
||||
2. **Pipeline Structure**: Designed around a pipeline that processes individual jobs within an asynchronous execution
|
||||
graph. Each job is processed by a series of stages or task handlers.
|
||||
|
||||
3. **Job Composition**: Jobs consist of a data payload, metadata, and task specifications that determine the processing
|
||||
steps applied to the data.
|
||||
|
||||
4. **Job Submission**:
|
||||
|
||||
- A job is submitted as a JSON specification and converted into
|
||||
a [ControlMessage](https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/docs/source/developer_guide/guides/9_control_messages.md),
|
||||
with the payload consisting of a cuDF dataframe.
|
||||
- For example:
|
||||
```text
|
||||
document_type source_id uuid metadata
|
||||
0 pdf somefile 1234 { ... }
|
||||
```
|
||||
- The `metadata` column contents correspond to
|
||||
the [schema-enforced metadata format of returned data](docs/content-metadata.md).
|
||||
|
||||
5. **Pipeline Processing**:
|
||||
|
||||
- The `ControlMessage` is passed through the pipeline, where each stage processes the data and metadata as needed.
|
||||
- Subsequent stages may add, transform, or filter data as needed, with all resulting artifacts stored in
|
||||
the `ControlMessage`'s payload.
|
||||
- For example, after processing, the payload may look like:
|
||||
```text
|
||||
document_type source_id uuid metadata
|
||||
0 text somefile abcd-1234 {'content': "The quick brown fox jumped...", ...}
|
||||
1 image somefile efgh-5678 {'content': "base64 encoded image", ...}
|
||||
2 image somefile xyza-5618 {'content': "base64 encoded image", ...}
|
||||
3 image somefile zxya-5628 {'content': "base64 encoded image", ...}
|
||||
4 status somefile kvq9-5600 {'content': "", 'status': "filtered", ...}
|
||||
```
|
||||
- A single job can result in multiple artifacts, each with its own metadata element definition.
|
||||
|
||||
6. **Job Completion**:
|
||||
- Upon reaching the end of the pipeline, the `ControlMessage` is converted into a `JobResult` object and pushed to
|
||||
the ephemeral output queue for client retrieval.
|
||||
- `JobResult` objects consist of a dictionary containing the following fields (a brief sketch follows this list):
|
||||
1. **data**: A list of metadata artifacts produced by the job.
|
||||
2. **status**: The job status as success or failure.
|
||||
3. **description**: A human-readable description of the job status.
|
||||
4. **trace**: A list of timing traces generated during the job's processing.
|
||||
5. **annotations**: A list of task annotations generated during the job's processing.
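To make the result shape concrete, the following is a hypothetical sketch of a fetched `JobResult` after deserialization in Python. The field names follow the list above; the values shown are invented examples, and real metadata entries follow the [schema-enforced metadata format](docs/content-metadata.md) referenced earlier.

```python
# Hypothetical example of a deserialized JobResult; the values are illustrative only.
job_result = {
    "data": [
        {"document_type": "text", "metadata": {"content": "The quick brown fox jumped...", "source_id": "somefile"}},
        {"document_type": "image", "metadata": {"content": "<base64 encoded image>", "source_id": "somefile"}},
    ],
    "status": "success",  # or "failure"
    "description": "Job completed successfully.",
    "trace": [{"stage": "pdf_extractor", "entry_ts": 1722880000.0, "exit_ts": 1722880001.5}],
    "annotations": [{"task": "extract", "message": "completed"}],
}

print(job_result["status"], len(job_result["data"]))
```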
|
||||
|
||||
### Common Processing Patterns
|
||||
|
||||
In NV-Ingest, decorators are used to enhance the functionality of functions by adding additional processing logic. These
|
||||
decorators help ensure consistency, traceability, and robust error handling across the pipeline. Below, we introduce
|
||||
some common decorators used in NV-Ingest, explain their usage, and provide examples.
|
||||
|
||||
#### `traceable` -> `src/nv_ingest/util/tracing/tagging.py`
|
||||
|
||||
The `traceable` decorator adds entry and exit trace timestamps to a `ControlMessage`'s metadata. This helps in
|
||||
monitoring and debugging by recording the time taken for function execution.
|
||||
|
||||
**Usage:**
|
||||
|
||||
- To track function execution time with default trace names:
|
||||
```python
|
||||
@traceable()
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
- To use a custom trace name:
|
||||
```python
|
||||
@traceable(trace_name="CustomTraceName")
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
|
||||
#### `nv_ingest_node_failure_context_manager` -> `src/nv_ingest/util/exception_handlers/decorators.py`
|
||||
|
||||
This decorator wraps a function with failure handling logic to manage potential failures involving `ControlMessages`. It
|
||||
ensures that failures are managed consistently, optionally raising exceptions or annotating the `ControlMessage`.
|
||||
|
||||
**Usage:**
|
||||
|
||||
- To handle failures with default settings:
|
||||
```python
|
||||
@nv_ingest_node_failure_context_manager(annotation_id="example_task")
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
- To handle failures and allow empty payloads:
|
||||
```python
|
||||
@nv_ingest_node_failure_context_manager(annotation_id="example_task", payload_can_be_empty=True)
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
|
||||
#### `filter_by_task` -> `src/nv_ingest/util/flow_control/filter_by_task.py`
|
||||
|
||||
The `filter_by_task` decorator checks if the `ControlMessage` contains any of the specified tasks. Each task can be a
|
||||
string of the task name or a tuple of the task name and task properties. If the message does not contain any listed task
|
||||
and/or task properties, the message is returned directly without calling the wrapped function, unless a forwarding
|
||||
function is provided.
|
||||
|
||||
**Usage:**
|
||||
|
||||
- To filter messages based on tasks:
|
||||
```python
|
||||
@filter_by_task(["task1", "task2"])
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
- To filter messages based on tasks with specific properties:
|
||||
```python
|
||||
@filter_by_task([("task", {"prop": "value"})])
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
- To forward messages to another function. This is necessary when the decorated function does not return the message
|
||||
directly, but instead forwards it to another function. In this case, the forwarding function should be provided as an
|
||||
argument to the decorator.
|
||||
```python
|
||||
@filter_by_task(["task1", "task2"], forward_func=other_function)
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
|
||||
#### `cm_skip_processing_if_failed` -> `morpheus/utils/control_message_utils.py`
|
||||
|
||||
The `cm_skip_processing_if_failed` decorator skips the processing of a `ControlMessage` if it has already failed. This
|
||||
ensures that no further processing is attempted on a failed message, maintaining the integrity of the pipeline.
|
||||
|
||||
**Usage:**
|
||||
|
||||
- To skip processing if the message has failed:
|
||||
```python
|
||||
@cm_skip_processing_if_failed
|
||||
def process_message(message):
|
||||
pass
|
||||
```
|
||||
|
||||
### Adding a New Stage or Module
|
||||
|
||||
#### TODO(Devin): Add details about adding a new stage or module once we have router node functionality in place.
|
||||
|
||||
### Common Practices for Writing Unit Tests
|
||||
|
||||
Writing unit tests is essential for maintaining code quality and ensuring that changes do not introduce new bugs. In
|
||||
this project, we use `pytest` for running tests and adopt blackbox testing principles. Below are some common practices
|
||||
for writing unit tests, which are located in the `[repo_root]/tests` directory.
|
||||
|
||||
#### General Guidelines
|
||||
|
||||
1. **Test Structure**: Each test module should test a specific module or functionality within the codebase. The test
|
||||
module should be named `test_<module_name>.py`, and reside on a mirrored physical path to its corresponding test
|
||||
target to be easily discoverable by `pytest`.
|
||||
|
||||
1. Example: `nv_ingest/some_path/another_path/my_module.py` should have a corresponding test file:
|
||||
`tests/some_path/another_path/test_my_module.py`.
|
||||
|
||||
2. **Test Functions**: Each test function should focus on a single aspect of the functionality. Use descriptive names
|
||||
that clearly indicate what is being tested. For example, `test_function_returns_correct_value`
|
||||
or `test_function_handles_invalid_input`.
|
||||
|
||||
3. **Setup and Teardown**: Use `pytest` fixtures to manage setup and teardown operations for your tests. Fixtures help
|
||||
in creating a consistent and reusable setup environment.
|
||||
|
||||
4. **Assertions**: Use assertions to validate the behavior of the code. Ensure that the tests cover both expected
|
||||
outcomes and edge cases.
|
||||
|
||||
#### Mocking External Services
|
||||
|
||||
When writing tests that depend on external services (e.g., databases, APIs), it is important to mock these dependencies
|
||||
to ensure that tests are reliable, fast, and do not depend on external factors.
|
||||
|
||||
1. **Mocking Libraries**: Use libraries like `unittest.mock` to create mocks for external services. The `pytest-mock`
|
||||
plugin can also be used to integrate mocking capabilities directly with `pytest`.
|
||||
|
||||
2. **Mock Objects**: Create mock objects to simulate the behavior of external services. Use these mocks to test how your
|
||||
code interacts with these services without making actual network calls or database transactions.
|
||||
|
||||
3. **Patching**: Use `patch` to replace real objects in your code with mocks. This can be done at the function, method,
|
||||
or object level. Ensure that patches are applied in the correct scope to avoid side effects.
|
||||
|
||||
#### Example Test Structure
|
||||
|
||||
Here is an example of how to structure a test module in the `[repo_root]/tests` directory:
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from unittest.mock import patch, Mock
|
||||
|
||||
# Assuming the module to test is located at [repo_root]/module.py
|
||||
from module import function_to_test
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_external_service():
|
||||
with patch('module.ExternalService') as mock_service:
|
||||
yield mock_service
|
||||
|
||||
|
||||
def test_function_returns_correct_value(mock_external_service):
|
||||
# Arrange
|
||||
mock_external_service.return_value.some_method.return_value = 'expected_value'
|
||||
|
||||
# Act
|
||||
result = function_to_test()
|
||||
|
||||
# Assert
|
||||
assert result == 'expected_value'
|
||||
|
||||
|
||||
def test_function_handles_invalid_input(mock_external_service):
|
||||
# Arrange
|
||||
mock_external_service.return_value.some_method.side_effect = ValueError("Invalid input")
|
||||
|
||||
# Act and Assert
|
||||
with pytest.raises(ValueError, match="Invalid input"):
|
||||
function_to_test(invalid_input)
|
||||
```
|
||||
|
||||
## Submodules, Third Party Libraries, and Models
|
||||
|
||||
### Submodules
|
||||
|
||||
1. Submodules are used to manage third-party libraries and dependencies.
|
||||
2. Submodules should be created in the `third_party` directory.
|
||||
3. Ensure that the submodule is updated to the latest commit before making changes.
|
||||
|
||||
### Models
|
||||
|
||||
1. **Model Integration**: NV-Ingest is designed to be scalable and flexible, so running models directly in the pipeline
|
||||
is discouraged.
|
||||
2. **Model Export**: Models should be exported to a format compatible with Triton Inference Server or TensorRT.
|
||||
- Model acquisition and conversion should be documented in `triton_models/README.md`, including the model name,
|
||||
version, pbtxt file, Triton model files, etc., along with an example of how to query the model in Triton.
|
||||
- Models should be externally hosted and downloaded during the pipeline execution, or added via LFS.
|
||||
- Any additional code, configuration files, or scripts required to run the model should be included in
|
||||
the `triton_models/[MODEL_NAME]` directory.
|
||||
3. **Self-Contained Dependencies**: No assumptions should be made regarding other models or libraries being available in
|
||||
the pipeline. All dependencies should be self-contained.
|
||||
4. **Base Triton Container**: Directions for the creation of the base Triton container are listed in
|
||||
the `triton_models/README.md` file. If a new model requires additional base dependencies, please update
|
||||
the `Dockerfile` in the `triton_models` directory.
|
||||
|
||||
## Architectural Guidelines
|
||||
|
||||
To ensure the quality and maintainability of the NV-Ingest codebase, the following architectural guidelines should be
|
||||
followed:
|
||||
|
||||
### 1. Single Responsibility Principle (SRP)
|
||||
|
||||
- Ensure that each module, class, or function has only one reason to change.
|
||||
|
||||
### 2. Interface Segregation Principle (ISP)
|
||||
|
||||
- Avoid forcing clients to depend on interfaces they do not use.
|
||||
|
||||
### 3. Dependency Inversion Principle (DIP)
|
||||
|
||||
- High-level modules should not depend on low-level modules; both should depend on abstractions.
|
||||
|
||||
### 4. Physical Design Structure Mirroring Logical Design Structure
|
||||
|
||||
- The physical layout of the codebase should reflect its logical structure.
|
||||
|
||||
### 5. Levelization
|
||||
|
||||
- Organize code into levels where higher-level components depend on lower-level components but not vice versa.
|
||||
|
||||
### 6. Acyclic Dependencies Principle (ADP)
|
||||
|
||||
- Ensure the dependency graph of packages/modules has no cycles.
|
||||
|
||||
### 7. Package Cohesion Principles
|
||||
|
||||
#### Common Closure Principle (CCP)
|
||||
|
||||
- Package classes that change together.
|
||||
|
||||
#### Common Reuse Principle (CRP)
|
||||
|
||||
- Package classes that are used together.
|
||||
|
||||
### 8. Encapsulate What Varies
|
||||
|
||||
- Identify aspects of the application that vary and separate them from what stays the same.
|
||||
|
||||
### 9. Favor Composition Over Inheritance
|
||||
|
||||
- Utilize object composition over class inheritance for behavior reuse where possible.
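A minimal sketch of what this looks like in practice; the class names below are illustrative and not taken from the NV-Ingest codebase:

```python
# Illustrative only: the stage is composed with an extractor passed in at
# construction time instead of inheriting extraction behavior from a base class.
class PdfiumExtractor:
    def extract(self, payload: bytes) -> str:
        return "extracted text"  # stand-in for real extraction logic


class ExtractionStage:
    def __init__(self, extractor) -> None:
        self.extractor = extractor  # composed dependency; easy to swap or mock in tests

    def process(self, payload: bytes) -> str:
        return self.extractor.extract(payload)


stage = ExtractionStage(PdfiumExtractor())
print(stage.process(b"%PDF-1.7 ..."))
```

Swapping in a different extractor, or a mock in unit tests, requires no changes to `ExtractionStage`.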
|
||||
|
||||
### 10. Clean Separation of Concerns (SoC)
|
||||
|
||||
- Divide the application into distinct features with minimal overlap in functionality.
|
||||
|
||||
### 11. Principle of Least Knowledge (Law of Demeter)
|
||||
|
||||
- Objects should assume as little as possible about the structure or properties of anything else, including their
|
||||
subcomponents.
|
||||
|
||||
### 12. Document Assumptions and Decisions
|
||||
|
||||
- Assumptions made and reasons behind architectural and design decisions should be clearly documented.
|
||||
|
||||
### 13. Continuous Integration and Testing
|
||||
|
||||
- Integrate code frequently into a shared repository and ensure comprehensive testing is an integral part of the
|
||||
development cycle.
|
||||
|
||||
Contributors are encouraged to follow these guidelines to ensure contributions are in line with the project's
|
||||
architectural consistency and maintainability.
|
||||
|
||||
## Licensing
|
||||
|
||||
NV-Ingest is licensed under the Apache License, Version 2.0; ensure that any contributions are compatible.
|
||||
|
||||
The following should be included in the header of any new files:
|
||||
|
||||
```text
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
```
|
||||
|
||||
## Attribution
|
||||
|
||||
Portions adopted from
|
||||
|
||||
- [https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/CONTRIBUTING.md](https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/CONTRIBUTING.md)
|
||||
- [https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md)
|
||||
- [https://github.com/dask/dask/blob/master/docs/source/develop.rst](https://github.com/dask/dask/blob/master/docs/source/develop.rst)
|
||||
91
Dockerfile
Normal file
@@ -0,0 +1,91 @@
|
||||
# syntax=docker/dockerfile:1.3
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
ARG BASE_IMG=nvcr.io/nvidia/morpheus/morpheus
|
||||
ARG BASE_IMG_TAG=v24.06.01-runtime
|
||||
|
||||
# Use NVIDIA Morpheus as the base image
|
||||
FROM $BASE_IMG:$BASE_IMG_TAG AS base
|
||||
|
||||
ARG RELEASE_TYPE="dev"
|
||||
ARG VERSION=""
|
||||
ARG VERSION_REV="0"
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install --yes \
|
||||
libgl1-mesa-glx
|
||||
|
||||
# Copy the module code
|
||||
COPY setup.py setup.py
|
||||
# Don't copy full source here, pipelines won't be installed via setup anyway, and this allows us to rebuild more quickly if we're just changing the pipeline
|
||||
COPY src/nv_ingest src/nv_ingest
|
||||
COPY client client
|
||||
COPY ci ci
|
||||
COPY requirements.txt test-requirements.txt util-requirements.txt ./
|
||||
|
||||
RUN rm -rf ./src/nv_ingest/dist ./client/dist
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# Prevent haystack from sending telemetry data
|
||||
ENV HAYSTACK_TELEMETRY_ENABLED=False
|
||||
|
||||
# Ensure the NV_INGEST_VERSION is PEP 440 compatible
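# Illustrative examples (assumed values): with VERSION=24.08 and VERSION_REV=0, a "dev" build
# yields 24.08.dev0 and a "release" build yields 24.08.post0; if VERSION is unset, the current
# date (for example 2024.08.07) is substituted before the suffix is applied.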
|
||||
RUN if [ -z "${VERSION}" ]; then \
|
||||
export VERSION="$(date +'%Y.%m.%d')"; \
|
||||
fi; \
|
||||
if [ "${RELEASE_TYPE}" = "dev" ]; then \
|
||||
export NV_INGEST_VERSION_OVERRIDE="${VERSION}.dev${VERSION_REV}"; \
|
||||
elif [ "${RELEASE_TYPE}" = "release" ]; then \
|
||||
export NV_INGEST_VERSION_OVERRIDE="${VERSION}.post${VERSION_REV}"; \
|
||||
else \
|
||||
echo "Invalid RELEASE_TYPE: ${RELEASE_TYPE}"; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
ENV NV_INGEST_RELEASE_TYPE=${RELEASE_TYPE}
|
||||
ENV NV_INGEST_VERSION_OVERRIDE=${NV_INGEST_VERSION_OVERRIDE}
|
||||
ENV NV_INGEST_CLIENT_VERSION_OVERRIDE=${NV_INGEST_VERSION_OVERRIDE}
|
||||
|
||||
# Run the build_pip_packages.sh script with the specified build type and library
|
||||
RUN chmod +x ./ci/scripts/build_pip_packages.sh \
|
||||
&& ./ci/scripts/build_pip_packages.sh --type ${RELEASE_TYPE} --lib client \
|
||||
&& ./ci/scripts/build_pip_packages.sh --type ${RELEASE_TYPE} --lib service
|
||||
|
||||
RUN source activate morpheus \
|
||||
&& pip install ./dist/*.whl
|
||||
|
||||
RUN source activate morpheus \
|
||||
&& rm -rf src requirements.txt test-requirements.txt util-requirements.txt
|
||||
|
||||
# Interim pyarrow backport until folded into upstream dependency tree
|
||||
RUN source activate morpheus \
|
||||
&& conda install https://anaconda.org/conda-forge/pyarrow/14.0.2/download/linux-64/pyarrow-14.0.2-py310h188ebfb_19_cuda.conda
|
||||
|
||||
# Upgrade setuptools to mitigate https://github.com/advisories/GHSA-cx63-2mw6-8hw5
|
||||
RUN source activate base \
|
||||
&& conda install setuptools==70.0.0
|
||||
|
||||
FROM base AS runtime
|
||||
|
||||
RUN source activate morpheus \
|
||||
&& pip install ./client/dist/*.whl \
|
||||
&& rm -rf client/dist
|
||||
|
||||
COPY src/pipeline.py ./
|
||||
COPY pyproject.toml ./
|
||||
COPY ./docker/scripts/entrypoint_source_ext.sh /opt/docker/bin/entrypoint_source
|
||||
|
||||
CMD ["python", "/workspace/pipeline.py"]
|
||||
|
||||
FROM base AS development
|
||||
|
||||
RUN source activate morpheus && \
|
||||
pip install -e ./client
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
201
LICENSE
Normal file
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
344
README.md
Normal file
@@ -0,0 +1,344 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
|
||||
## NVIDIA-Ingest: Multi-modal data extraction
|
||||
|
||||
NVIDIA-Ingest is a scalable, performance-oriented document content and metadata extraction microservice. With support for parsing PDF, Word, and PowerPoint documents, it uses specialized NVIDIA NIM microservices to find, contextualize, and extract text, tables, charts, and images for use in downstream generative applications.
|
||||
|
||||
NVIDIA Ingest enables parallelization of the process of splitting documents into pages where contents are classified (as tables, charts, images, text), extracted into discrete content, and further contextualized via optical character recognition (OCR) into a well-defined JSON schema. From there, NVIDIA Ingest can optionally manage computation of embeddings for the extracted content and store the results in the [Milvus](https://milvus.io/) vector database.
|
||||
|
||||
### What it is
|
||||
|
||||
A microservice that:
|
||||
|
||||
- Accepts a JSON Job description, containing a document payload, and a set of ingestion tasks to perform on that
|
||||
payload.
|
||||
- Allows the results of a Job to be retrieved; the result is a JSON dictionary containing a list of Metadata describing
|
||||
objects extracted from the base document, as well as processing annotations and timing/trace data.
|
||||
- Supports PDF, DOCX, PPTX, and images.
|
||||
- Supports multiple methods of extraction for each document type in order to balance trade-offs between throughput and
|
||||
accuracy. For example, for PDF documents we support extraction via pdfium, Unstructured.io, and Adobe Content Extraction Services.
|
||||
- Supports various types of pre- and post-processing operations, including text splitting and chunking; transformation and filtering; embedding generation; and image offloading to storage.
|
||||
|
||||
### What it is not
|
||||
|
||||
A service that:
|
||||
|
||||
- Runs a static pipeline or fixed set of operations on every submitted document.
|
||||
- Acts as a wrapper for any specific document parsing library.
|
||||
|
||||
|
||||
## Quickstart
|
||||
|
||||
To get started using NVIDIA Ingest, you need to do a few things:
|
||||
1. [Start the supporting NIM microservices](#step-1-starting-containers)
|
||||
2. [Install the NVIDIA Ingest client dependencies in a Python environment](#step-2-installing-python-dependencies)
|
||||
3. [Submit ingestion job(s)](#step-3-ingesting-documents)
|
||||
4. [Inspect and consume results](#step-4-inspecting-and-consuming-results)
|
||||
|
||||
### Step 1: Starting containers
|
||||
|
||||
This example demonstrates how to use the provided [docker-compose.yml](docker-compose.yaml) to build and start all needed services with two commands.
|
||||
|
||||
If preferred, you can also [start services one by one](docs/deployment.md), or run on Kubernetes via [our Helm chart](helm/README.md). Also of note are [additional environment variables](docs/environment-config.md) you may wish to configure.
|
||||
|
||||
First, git clone the repo:
|
||||
`git clone https://github.com/nvidia/nv-ingest` and `cd nv-ingest`.
|
||||
|
||||
So that Docker can pull the pre-built container images and NIM microservices, create a `.env` file and add your API keys to it:
|
||||
```
|
||||
NIM_NGC_API_KEY=...
|
||||
NGC_API_KEY=...
|
||||
NGC_CLI_API_KEY=...
|
||||
DATASET_ROOT=<PATH_TO_THIS_REPO>/data
|
||||
NV_INGEST_ROOT=<PATH_TO_THIS_REPO>
|
||||
```
|
||||
|
||||
To build Docker images locally:
|
||||
|
||||
`docker compose build`
|
||||
|
||||
Note: As configured by default in [docker-compose.yml](docker-compose.yaml), the YOLOX, DePlot, and CACHED NIM models are each pinned to a dedicated GPU. The PaddleOCR and nv-embedqa-e5 NIM models and the nv-ingest-ms-runtime share a fourth. Thus our minimum requirements are 4x NVIDIA A100 or H100 Tensor Core GPUs.
|
||||
|
||||
To start all services:
|
||||
`docker compose up`
|
||||
|
||||
Please note that on their first startup, NIM containers can take 10-15 minutes to pull and fully load models. Also note that by default we have [configured log levels to be verbose](docker-compose.yaml#L31) so that service startup can be observed; you will see _many_ log messages. You can control verbosity on a per-service level via each service's environment variables.
|
||||
|
||||
When all services have fully started, `nvidia-smi` should show processes like the following:
|
||||
```
|
||||
+---------------------------------------------------------------------------------------+
|
||||
| Processes: |
|
||||
| GPU GI CI PID Type Process name GPU Memory |
|
||||
| ID ID Usage |
|
||||
|=======================================================================================|
|
||||
| 0 N/A N/A 1352957 C tritonserver 762MiB |
|
||||
| 1 N/A N/A 1322081 C /opt/nim/llm/.venv/bin/python3 63916MiB |
|
||||
| 2 N/A N/A 1355175 C tritonserver 478MiB |
|
||||
| 2 N/A N/A 1367569 C ...s/python/triton_python_backend_stub 12MiB |
|
||||
| 3 N/A N/A 1321841 C python 414MiB |
|
||||
| 3 N/A N/A 1352331 C tritonserver 478MiB |
|
||||
| 3 N/A N/A 1355929 C ...s/python/triton_python_backend_stub 424MiB |
|
||||
| 3 N/A N/A 1373202 C tritonserver 414MiB |
|
||||
+---------------------------------------------------------------------------------------+
|
||||
|
||||
```
|
||||
If `nvidia-smi` takes more than a minute to return, the GPUs are likely still busy loading the models.

Once it returns quickly (within a few seconds), the NIM models are ready.
|
||||
|
||||
### Step 2: Installing Python dependencies
|
||||
|
||||
|
||||
On the host, you'll need to create a Python environment and install dependencies:
|
||||
```
|
||||
conda create --name nv-ingest-dev python=3.10
|
||||
conda activate nv-ingest-dev
|
||||
cd client
|
||||
pip install -r ./requirements.txt
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
### Step 3: Ingesting Documents
|
||||
|
||||
You can submit jobs programmatically in Python or via the nv-ingest-cli tool.
|
||||
|
||||
In Python (find the complete script [here](https://github.com/NVIDIA/nv-ingest/blob/main/client/examples/sample_job.py)):
|
||||
```python
import time

from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.primitives import JobSpec
from nv_ingest_client.primitives.tasks import ExtractTask
from nv_ingest_client.util.file_processing.extract import extract_file_content

client = NvIngestClient()

# Read the document payload and detect its type; file_name is the path to your document.
file_content, file_type = extract_file_content(file_name)

# Create and submit a multi-modal extraction job.
job_spec = JobSpec(
    document_type=file_type,
    payload=file_content[0],
    source_id=file_name,
    source_name=file_name,
    extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
)

extract_task = ExtractTask(
    document_type=file_type,
    extract_text=True,
    extract_images=True,
)

job_spec.add_task(extract_task)
job_id = client.add_job(job_spec)

client.submit_job(job_id, "morpheus_task_queue")

result = client.fetch_job_result(job_id)
# Get back the extracted pdf data.
print(f"Got {len(result)} results")
```
|
||||
|
||||
Using the `nv-ingest-cli`:
|
||||
|
||||
```shell
|
||||
nv-ingest-cli \
|
||||
--doc ./data/test.pdf \
|
||||
--output_directory ./processed_docs \
|
||||
--task='extract:{"document_type": "pdf", "extract_method": "pdfium"}' \
|
||||
--client_host=localhost \
|
||||
--client_port=6379
|
||||
```
|
||||
|
||||
You should notice output indicating document processing status:
|
||||
```
|
||||
INFO:nv_ingest_client.nv_ingest_cli:Processing 1 documents.
|
||||
INFO:nv_ingest_client.nv_ingest_cli:Output will be written to: ./processed_docs
|
||||
Processing files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.29file/s, pages_per_sec=1.27]
|
||||
INFO:nv_ingest_client.cli.util.processing:dedup_images: Avg: 0.63 ms, Median: 0.63 ms, Total Time: 0.63 ms, Total % of Trace Computation: 0.09%
|
||||
INFO:nv_ingest_client.cli.util.processing:dedup_images_channel_in: Avg: 3.68 ms, Median: 3.68 ms, Total Time: 3.68 ms, Total % of Trace Computation: 0.51%
|
||||
INFO:nv_ingest_client.cli.util.processing:docx_content_extractor: Avg: 0.95 ms, Median: 0.95 ms, Total Time: 0.95 ms, Total % of Trace Computation: 0.13%
|
||||
INFO:nv_ingest_client.cli.util.processing:docx_content_extractor_channel_in: Avg: 1.47 ms, Median: 1.47 ms, Total Time: 1.47 ms, Total % of Trace Computation: 0.20%
|
||||
INFO:nv_ingest_client.cli.util.processing:filter_images: Avg: 1.12 ms, Median: 1.12 ms, Total Time: 1.12 ms, Total % of Trace Computation: 0.15%
|
||||
INFO:nv_ingest_client.cli.util.processing:filter_images_channel_in: Avg: 3.54 ms, Median: 3.54 ms, Total Time: 3.54 ms, Total % of Trace Computation: 0.49%
|
||||
INFO:nv_ingest_client.cli.util.processing:job_counter: Avg: 7.66 ms, Median: 7.66 ms, Total Time: 7.66 ms, Total % of Trace Computation: 1.06%
|
||||
INFO:nv_ingest_client.cli.util.processing:job_counter_channel_in: Avg: 0.26 ms, Median: 0.26 ms, Total Time: 0.26 ms, Total % of Trace Computation: 0.04%
|
||||
INFO:nv_ingest_client.cli.util.processing:metadata_injection: Avg: 34.42 ms, Median: 34.42 ms, Total Time: 34.42 ms, Total % of Trace Computation: 4.74%
|
||||
INFO:nv_ingest_client.cli.util.processing:metadata_injection_channel_in: Avg: 0.20 ms, Median: 0.20 ms, Total Time: 0.20 ms, Total % of Trace Computation: 0.03%
|
||||
INFO:nv_ingest_client.cli.util.processing:pdf_content_extractor: Avg: 619.98 ms, Median: 619.98 ms, Total Time: 619.98 ms, Total % of Trace Computation: 85.42%
|
||||
INFO:nv_ingest_client.cli.util.processing:pdf_content_extractor_channel_in: Avg: 0.76 ms, Median: 0.76 ms, Total Time: 0.76 ms, Total % of Trace Computation: 0.10%
|
||||
INFO:nv_ingest_client.cli.util.processing:pptx_content_extractor: Avg: 11.57 ms, Median: 11.57 ms, Total Time: 11.57 ms, Total % of Trace Computation: 1.59%
|
||||
INFO:nv_ingest_client.cli.util.processing:pptx_content_extractor_channel_in: Avg: 2.02 ms, Median: 2.02 ms, Total Time: 2.02 ms, Total % of Trace Computation: 0.28%
|
||||
INFO:nv_ingest_client.cli.util.processing:redis_source_network_in: Avg: 16.11 ms, Median: 16.11 ms, Total Time: 16.11 ms, Total % of Trace Computation: 2.22%
|
||||
INFO:nv_ingest_client.cli.util.processing:redis_task_sink_channel_in: Avg: 2.58 ms, Median: 2.58 ms, Total Time: 2.58 ms, Total % of Trace Computation: 0.36%
|
||||
INFO:nv_ingest_client.cli.util.processing:redis_task_source: Avg: 18.81 ms, Median: 18.81 ms, Total Time: 18.81 ms, Total % of Trace Computation: 2.59%
|
||||
INFO:nv_ingest_client.cli.util.processing:Unresolved time: 66.51 ms, Percent of Total Elapsed: 8.39%
|
||||
INFO:nv_ingest_client.cli.util.processing:Processed 1 files in 0.79 seconds.
|
||||
INFO:nv_ingest_client.cli.util.processing:Total pages processed: 1
|
||||
INFO:nv_ingest_client.cli.util.processing:Throughput (Pages/sec): 1.26
|
||||
INFO:nv_ingest_client.cli.util.processing:Throughput (Files/sec): 1.26
|
||||
INFO:nv_ingest_client.cli.util.processing:Total timeouts: 0
|
||||
```
|
||||
|
||||
### Step 4: Inspecting and Consuming Results
|
||||
|
||||
After the ingestion steps above have completed, you should be able to find `text` and `image` subfolders inside your processed docs folder. Each will contain JSON formatted extracted content and metadata.
|
||||
|
||||
When processing has completed, you'll have separate result files for text and image data.
|
||||
|
||||
Expected text extracts:
|
||||
```shell
|
||||
cat ./processed_docs/text/test.pdf.metadata.json
|
||||
[{
|
||||
"document_type": "text",
|
||||
"metadata": {
|
||||
"content": "Here is one line of text. Here is another line of text. Here is an image.",
|
||||
"content_metadata": {
|
||||
"description": "Unstructured text from PDF document.",
|
||||
"hierarchy": {
|
||||
"block": -1,
|
||||
"line": -1,
|
||||
"page": -1,
|
||||
"page_count": 1,
|
||||
"span": -1
|
||||
},
|
||||
"page_number": -1,
|
||||
"type": "text"
|
||||
},
|
||||
"error_metadata": null,
|
||||
"image_metadata": null,
|
||||
"source_metadata": {
|
||||
"access_level": 1,
|
||||
"collection_id": "",
|
||||
"date_created": "2024-03-11T14:56:40.125063",
|
||||
"last_modified": "2024-03-11T14:56:40.125054",
|
||||
"partition_id": -1,
|
||||
"source_id": "test.pdf",
|
||||
"source_location": "",
|
||||
"source_name": "",
|
||||
"source_type": "PDF 1.4",
|
||||
"summary": ""
|
||||
},
|
||||
"text_metadata": {
|
||||
"keywords": "",
|
||||
"language": "en",
|
||||
"summary": "",
|
||||
"text_type": "document"
|
||||
}
|
||||
}
|
||||
}]
```
|
||||
|
||||
Expected image extracts:
|
||||
```shell
|
||||
$ cat ./processed_docs/image/test.pdf.metadata.json
|
||||
[{
|
||||
"document_type": "image",
|
||||
"metadata": {
|
||||
"content": "<--- Base64 encoded image data --->",
|
||||
"content_metadata": {
|
||||
"description": "Image extracted from PDF document.",
|
||||
"hierarchy": {
|
||||
"block": 3,
|
||||
"line": -1,
|
||||
"page": 0,
|
||||
"page_count": 1,
|
||||
"span": -1
|
||||
},
|
||||
"page_number": 0,
|
||||
"type": "image"
|
||||
},
|
||||
"error_metadata": null,
|
||||
"image_metadata": {
|
||||
"caption": "",
|
||||
"image_location": [
|
||||
73.5,
|
||||
160.7775878906,
|
||||
541.5,
|
||||
472.7775878906
|
||||
],
|
||||
"image_type": "png",
|
||||
"structured_image_type": "image_type_1",
|
||||
"text": ""
|
||||
},
|
||||
"source_metadata": {
|
||||
"access_level": 1,
|
||||
"collection_id": "",
|
||||
"date_created": "2024-03-11T14:56:40.125063",
|
||||
"last_modified": "2024-03-11T14:56:40.125054",
|
||||
"partition_id": -1,
|
||||
"source_id": "test.pdf",
|
||||
"source_location": "",
|
||||
"source_name": "",
|
||||
"source_type": "PDF 1.4",
|
||||
"summary": ""
|
||||
},
|
||||
"text_metadata": null
|
||||
}
|
||||
}]
|
||||
```
|
||||
|
||||
We also provide a script for inspecting [extracted images](#image_viewerpy):
|
||||
```shell
|
||||
python src/util/image_viewer.py --file_path ./processed_docs/image/test.pdf.metadata.json
|
||||
```
|
||||
|
||||
Beyond inspecting the results, you can read them into something like a llama-index or langchain document query pipeline:
|
||||
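For example, a minimal sketch (assuming the LangChain `Document` class is available; the metadata keys follow the JSON shown above):

```python
import json

from langchain_core.documents import Document

# Load the text extracts produced above and wrap each entry as a LangChain Document.
with open("./processed_docs/text/test.pdf.metadata.json") as f:
    extracts = json.load(f)

docs = [
    Document(
        page_content=entry["metadata"]["content"],
        metadata=entry["metadata"]["source_metadata"],
    )
    for entry in extracts
    if entry["document_type"] == "text"
]
print(f"Loaded {len(docs)} documents")
```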
|
||||
Please also check out our [demo using a retrieval pipeline on build.nvidia.com](https://build.nvidia.com/nvidia/multimodal-pdf-data-extraction-for-enterprise-rag) to query over document content pre-extracted with NVIDIA Ingest.
|
||||
|
||||
## Third Party License Notice:
|
||||
|
||||
If configured to do so, this project will download and install additional third-party open source software projects.
|
||||
Review the license terms of these open source projects before use:
|
||||
|
||||
https://pypi.org/project/pdfservices-sdk/
|
||||
|
||||
- **`INSTALL_ADOBE_SDK`**:
|
||||
- **Description**: If set to `true`, the Adobe SDK will be installed in the container at launch time. This is
required if you want to use the Adobe extraction service for PDF decomposition. Please review the
[license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the
pdfservices-sdk before enabling this option.
|
||||
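For example, if you are using the `.env` approach from the quickstart, enabling it might look like the following (a sketch; verify the wiring against your compose configuration):

```
INSTALL_ADOBE_SDK=true
```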
|
||||
|
||||
## Contributing
|
||||
|
||||
We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original
|
||||
work, or you have rights to submit it under the same license, or a compatible license.
|
||||
|
||||
Any contribution which contains commits that are not Signed-Off will not be accepted.
|
||||
|
||||
To sign off on a commit you simply use the --signoff (or -s) option when committing your changes:
|
||||
|
||||
```
|
||||
$ git commit -s -m "Add cool feature."
|
||||
```
|
||||
|
||||
This will append the following to your commit message:
|
||||
|
||||
```
|
||||
Signed-off-by: Your Name <your@email.com>
|
||||
```
|
||||
|
||||
### Full text of the DCO:
|
||||
|
||||
```
|
||||
Developer Certificate of Origin
|
||||
Version 1.1
|
||||
|
||||
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
|
||||
1 Letterman Drive
|
||||
Suite D4700
|
||||
San Francisco, CA, 94129
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
|
||||
```
|
||||
|
||||
```
|
||||
Developer's Certificate of Origin 1.1
|
||||
|
||||
By making a contribution to this project, I certify that:
|
||||
|
||||
(a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
|
||||
|
||||
(b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
|
||||
|
||||
(c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
|
||||
|
||||
(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
|
||||
```
|
||||
24
SECURITY.md
Normal file
24
SECURITY.md
Normal file
@@ -0,0 +1,24 @@
|
||||
## Security
|
||||
|
||||
NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization.
|
||||
|
||||
If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.**
|
||||
|
||||
## Reporting Potential Security Vulnerability in an NVIDIA Product
|
||||
|
||||
To report a potential security vulnerability in any NVIDIA product:
|
||||
- Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html)
|
||||
- E-Mail: psirt@nvidia.com
|
||||
- We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key)
|
||||
- Please include the following information:
|
||||
- Product/Driver name and version/branch that contains the vulnerability
|
||||
- Type of vulnerability (code execution, denial of service, buffer overflow, etc.)
|
||||
- Instructions to reproduce the vulnerability
|
||||
- Proof-of-concept or exploit code
|
||||
- Potential impact of the vulnerability, including how an attacker could exploit the vulnerability
|
||||
|
||||
While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information.
|
||||
|
||||
## NVIDIA Product Security
|
||||
|
||||
For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security
|
||||
55
ci/scripts/build_pip_packages.sh
Executable file
55
ci/scripts/build_pip_packages.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Function to display usage
|
||||
usage() {
|
||||
echo "Usage: $0 --type <dev|release> --lib <client|service>"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Get the directory of the current script
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
|
||||
# Parse options
|
||||
while [[ "$#" -gt 0 ]]; do
|
||||
case $1 in
|
||||
--type) TYPE="$2"; shift ;;
|
||||
--lib) LIBRARY="$2"; shift ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
# Validate input
|
||||
if [[ -z "$TYPE" || -z "$LIBRARY" ]]; then
|
||||
usage
|
||||
fi
|
||||
|
||||
# Get current date
|
||||
DATE=$(date +'%Y.%m.%d')
|
||||
|
||||
# Set the version based on the build type
|
||||
if [[ "$TYPE" == "dev" ]]; then
|
||||
VERSION_SUFFIX="${DATE}-dev"
|
||||
elif [[ "$TYPE" == "release" ]]; then
|
||||
VERSION_SUFFIX="${DATE}"
|
||||
else
|
||||
echo "Invalid type: $TYPE"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Set library-specific variables and paths
|
||||
if [[ "$LIBRARY" == "client" ]]; then
|
||||
NV_INGEST_CLIENT_VERSION_OVERRIDE="${VERSION_SUFFIX}"
|
||||
export NV_INGEST_CLIENT_VERSION_OVERRIDE
|
||||
SETUP_PATH="$SCRIPT_DIR/../../client/setup.py"
|
||||
elif [[ "$LIBRARY" == "service" ]]; then
|
||||
NV_INGEST_SERVICE_VERSION_OVERRIDE="${VERSION_SUFFIX}"
|
||||
export NV_INGEST_SERVICE_VERSION_OVERRIDE
|
||||
SETUP_PATH="$SCRIPT_DIR/../../setup.py"
|
||||
else
|
||||
echo "Invalid library: $LIBRARY"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Build the wheel
|
||||
(cd "$(dirname "$SETUP_PATH")" && python setup.py sdist bdist_wheel)
|
||||
340
client/README.md
Normal file
340
client/README.md
Normal file
@@ -0,0 +1,340 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
# NV-Ingest-Client
|
||||
|
||||
NV-Ingest-Client is a powerful tool designed for efficient ingestion and processing of large datasets. It provides both
|
||||
a Python API and a command-line interface to cater to various ingestion needs.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Installation](#installation)
|
||||
2. [Usage](#usage)
|
||||
- [CLI Tool](#cli-tool)
|
||||
- [API Libraries](#api-libraries)
|
||||
3. [Command Line Interface (CLI)](#command-line-interface-cli)
|
||||
- [Command Overview](#command-overview)
|
||||
- [Options](#options)
|
||||
4. [Examples](#examples)
|
||||
5. [Configuration](#configuration)
|
||||
6. [Contributing](#contributing)
|
||||
7. [License](#license)
|
||||
|
||||
## Installation
|
||||
|
||||
To install NV-Ingest-Client, run the following command in your terminal:
|
||||
|
||||
```bash
|
||||
pip install [REPO_ROOT]/client
|
||||
```
|
||||
|
||||
This command installs both the API libraries and the `nv-ingest-cli` tool which can subsequently be called from the
|
||||
command line.
|
||||
|
||||
## Usage
|
||||
|
||||
TODO(Devin): Need to shift to sphinx, but for now, we can provide a brief overview of the API and CLI.
|
||||
|
||||
## API Libraries
|
||||
|
||||
### nv_ingest_client.primitives.jobs
|
||||
|
||||
#### JobSpec
|
||||
|
||||
Specification for creating a job for submission to the nv-ingest microservice.
|
||||
|
||||
- **Parameters**:
|
||||
|
||||
- `payload` (Dict): The payload data for the job.
|
||||
- `tasks` (Optional[List], optional): A list of tasks to be added to the job. Defaults to None.
|
||||
- `source_id` (Optional[str], optional): An identifier for the source of the job. Defaults to None.
|
||||
- `source_name` (Optional[str], optional): A name for the source of the job. Defaults to None.
|
||||
- `document_type` (Optional[str], optional): Type of the document. Defaults to 'txt'.
|
||||
- `job_id` (Optional[Union[UUID, str]], optional): A unique identifier for the job. Defaults to a new UUID.
|
||||
- `extended_options` (Optional[Dict], optional): Additional options for job processing. Defaults to None.
|
||||
|
||||
- **Attributes**:
|
||||
|
||||
- `_payload` (Dict): Storage for the payload data.
|
||||
- `_tasks` (List): Storage for the list of tasks.
|
||||
- `_source_id` (str): Storage for the source identifier.
|
||||
- `_job_id` (UUID): Storage for the job's unique identifier.
|
||||
- `_extended_options` (Dict): Storage for the additional options.
|
||||
|
||||
- **Methods**:
|
||||
|
||||
- **to_dict() -> Dict**:
|
||||
- **Description**: Converts the job specification to a dictionary for JSON serialization.
|
||||
- **Returns**: `Dict`: Dictionary representation of the job specification.
|
||||
- **add_task(task)**:
|
||||
- **Description**: Adds a task to the job specification.
|
||||
- **Parameters**:
|
||||
- `task`: The task to be added. Assumes the task has a `to_dict()` method.
|
||||
- **Raises**:
|
||||
- `ValueError`: If the task does not have a `to_dict()` method or is not an instance of `Task`.
|
||||
|
||||
- **Properties**:
|
||||
|
||||
- `payload`: Getter/Setter for the payload data.
|
||||
- `job_id`: Getter/Setter for the job's unique identifier.
|
||||
- `source_id`: Getter/Setter for the source identifier.
|
||||
- `source_name`: Getter/Setter for the source name.
|
||||
|
||||
- **Example Usage**:
|
||||
```python
|
||||
job_spec = JobSpec(
|
||||
payload={"data": "Example data"},
|
||||
tasks=[extract_task, split_task],
|
||||
source_id="12345",
|
||||
job_id="abcd-efgh-ijkl-mnop",
|
||||
extended_options={"tracing_options": {"trace": True}}
|
||||
)
|
||||
print(job_spec.to_dict())
|
||||
```
|
||||
|
||||
### nv_ingest_client.primitives.tasks
|
||||
|
||||
#### Task Factory
|
||||
|
||||
- **Function**: `task_factory(task_type, **kwargs)`
|
||||
|
||||
- **Description**: Factory method for creating task objects based on the provided task type. It dynamically selects
|
||||
the appropriate task class from a mapping and initializes it with any additional keyword arguments.
|
||||
- **Parameters**:
|
||||
- `task_type` (TaskType or str): The type of the task to create. Can be an enum member of `TaskType` or a string
|
||||
representing a valid task type.
|
||||
- `**kwargs` (dict): Additional keyword arguments to pass to the task's constructor.
|
||||
- **Returns**:
|
||||
- `Task`: An instance of the task corresponding to the given task type.
|
||||
- **Raises**:
|
||||
- `ValueError`: If an invalid task type is provided, or if any unexpected keyword arguments are passed that do
|
||||
not match the task constructor's parameters.
|
||||
|
||||
- **Example**:
|
||||
```python
|
||||
# Assuming TaskType has 'Extract' and 'Split' as valid members and corresponding classes are defined.
|
||||
extract_task = task_factory('extract', document_type='PDF', extract_text=True)
|
||||
split_task = task_factory('split', split_by='sentence', split_length=100)
|
||||
```
|
||||
|
||||
#### ExtractTask
|
||||
|
||||
Object for document extraction tasks, extending the `Task` class.
|
||||
|
||||
- **Method**: `__init__(document_type, extract_method='pdfium', extract_text=False, extract_images=False,
|
||||
extract_tables=False)`
|
||||
|
||||
- **Parameters**:
|
||||
- `document_type`: Type of document.
|
||||
- `extract_method`: Method used for extraction. Default is 'pdfium'.
|
||||
- `extract_text`: Boolean indicating if text should be extracted. Default is False.
|
||||
- `extract_images`: Boolean indicating if images should be extracted. Default is False.
|
||||
- `extract_tables`: Boolean indicating if tables should be extracted. Default is False.
|
||||
- **Description**: Sets up configuration for the extraction task.
|
||||
|
||||
- **Method: `to_dict()`**
|
||||
- **Description**: Converts task details to a dictionary for submission to message client. Includes handling for
|
||||
specific
|
||||
methods and document types.
|
||||
- **Returns**: `Dict`: Dictionary containing task type and properties.
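
- **Example Usage** (a construction sketch; parameter values are illustrative):
  ```python
  extract_task = ExtractTask(
      document_type="pdf",
      extract_method="pdfium",
      extract_text=True,
      extract_images=True,
      extract_tables=False,
  )
  print(extract_task.to_dict())
  ```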
|
||||
|
||||
#### SplitTask
|
||||
|
||||
Object for document splitting tasks, extending the `Task` class.
|
||||
|
||||
- **Method**: `__init__(split_by=None, split_length=None, split_overlap=None, max_character_length=None,
|
||||
sentence_window_size=None)`
|
||||
- **Parameters**:
|
||||
- `split_by`: Criterion for splitting, e.g., 'word', 'sentence', 'passage'.
|
||||
- `split_length`: The length of each split segment.
|
||||
- `split_overlap`: Overlap length between segments.
|
||||
- `max_character_length`: Maximum character length for a split.
|
||||
- `sentence_window_size`: Window size for sentence-based splits.
|
||||
- **Description**: Sets up configuration for the splitting task.
|
||||
- **Method: `to_dict()`**
|
||||
- **Description**: Converts task details to a dictionary for submission to message client.
|
||||
- **Returns**: `Dict`: Dictionary containing task type and properties.
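
- **Example Usage** (a construction sketch; the values mirror the defaults used in the bundled sample client):
  ```python
  split_task = SplitTask(
      split_by="sentence",
      split_length=4,
      split_overlap=1,
      max_character_length=1900,
      sentence_window_size=0,
  )
  print(split_task.to_dict())
  ```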
|
||||
|
||||
### nv_ingest_client.client.client
|
||||
|
||||
The `NvIngestClient` class provides a comprehensive suite of methods to handle job submission and retrieval processes
|
||||
efficiently. Below are the public methods available:
|
||||
|
||||
### Initialization
|
||||
|
||||
- **`__init__`**:
|
||||
Initializes the NvIngestClient with customizable client allocator and Redis configuration.
|
||||
- **Parameters**:
|
||||
- `message_client_allocator`: A callable that returns an instance of the client used for communication.
|
||||
- `message_client_hostname`: Hostname of the message client server. Defaults to "localhost".
|
||||
- `message_client_port`: Port number of the message client server. Defaults to 6379.
|
||||
- `message_client_kwargs`: Additional keyword arguments for the message client.
|
||||
- `msg_counter_id`: Redis key for tracking message counts. Defaults to "nv-ingest-message-id".
|
||||
- `worker_pool_size`: Number of worker processes in the pool. Defaults to 1.
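
A minimal construction sketch (the values shown are the documented defaults):

```python
client = NvIngestClient(
    message_client_hostname="localhost",
    message_client_port=6379,
    worker_pool_size=1,
)
```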
|
||||
|
||||
## Submission Methods
|
||||
|
||||
### submit_job
|
||||
|
||||
Submits a job to a specified job queue. This method can optionally wait for a response if blocking is set to True.
|
||||
|
||||
- **Parameters**:
|
||||
- `job_id`: The unique identifier of the job to be submitted.
|
||||
- `job_queue_id`: The ID of the job queue where the job will be submitted.
|
||||
- **Returns**:
|
||||
- Optional[Dict]: The job result if blocking is True and a result is available before the timeout, otherwise None.
|
||||
- **Raises**:
|
||||
- Exception: If submitting the job fails.
|
||||
|
||||
### submit_jobs
|
||||
|
||||
Submits multiple jobs to a specified job queue. This method does not wait for any of the jobs to complete.
|
||||
|
||||
- **Parameters**:
|
||||
- `job_ids`: A list of job IDs to be submitted.
|
||||
- `job_queue_id`: The ID of the job queue where the jobs will be submitted.
|
||||
- **Returns**:
|
||||
- List[Union[Dict, None]]: A list of job results if blocking is True and results are available before the timeout,
|
||||
otherwise None.
|
||||
|
||||
### submit_job_async
|
||||
|
||||
Asynchronously submits one or more jobs to a specified job queue using a thread pool. This method handles both a single
job ID and a list of job IDs.
|
||||
|
||||
- **Parameters**:
|
||||
- `job_ids`: A single job ID or a list of job IDs to be submitted.
|
||||
- `job_queue_id`: The ID of the job queue where the jobs will be submitted.
|
||||
- **Returns**:
|
||||
- Dict[Future, str]: A dictionary mapping futures to their respective job IDs for later retrieval of outcomes.
|
||||
- **Notes**:
|
||||
- This method queues the jobs for asynchronous submission and returns a mapping of futures to job IDs.
|
||||
- It does not wait for any of the jobs to complete.
|
||||
- Ensure that each job is in the proper state before submission.
|
||||
|
||||
## Job Retrieval
|
||||
|
||||
### fetch_job_result
|
||||
|
||||
- **Description**: Fetches the job result from a message client, handling potential errors and state changes.
|
||||
- **Method**: `fetch_job_result(job_id, timeout=10, data_only=True)`
|
||||
- **Parameters**:
|
||||
- `job_id` (str): The identifier of the job.
|
||||
- `timeout` (float, optional): Timeout for the fetch operation in seconds. Defaults to 10.
|
||||
- `data_only` (bool, optional): If true, only returns the data part of the job result.
|
||||
- **Returns**:
|
||||
- Tuple[Dict, str]: The job result and the job ID.
|
||||
- **Raises**:
|
||||
- `ValueError`: If there is an error in decoding the job result.
|
||||
- `TimeoutError`: If the fetch operation times out.
|
||||
- `Exception`: For all other unexpected issues.
|
||||
|
||||
### fetch_job_result_async
|
||||
|
||||
- **Description**: Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to
|
||||
job IDs.
|
||||
- **Method**: `fetch_job_result_async(job_ids, timeout=10, data_only=True)`
|
||||
- **Parameters**:
|
||||
- `job_ids` (Union[str, List[str]]): A single job ID or a list of job IDs.
|
||||
- `timeout` (float, optional): Timeout for fetching each job result, in seconds. Defaults to 10.
|
||||
- `data_only` (bool, optional): Whether to return only the data part of the job result.
|
||||
- **Returns**:
|
||||
- Dict[Future, str]: A dictionary mapping each future to its corresponding job ID.
|
||||
- **Raises**:
|
||||
- No explicit exceptions raised but leverages the exceptions from `fetch_job_result`.
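
A usage sketch mirroring the bundled sample client (assumes `client` and a list of previously added `job_ids`):

```python
from concurrent.futures import as_completed

# Submit all jobs without blocking, then gather results as they complete.
submit_futures = client.submit_job_async(job_ids, "morpheus_task_queue")
for _ in as_completed(submit_futures):
    pass  # each future resolves once its job has been sent

fetch_futures = client.fetch_job_result_async(job_ids, timeout=60)
for future in as_completed(fetch_futures):
    result = future.result()
    print(f"Got {len(result)} results")
```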
|
||||
|
||||
## Job and Task Management
|
||||
|
||||
### job_count
|
||||
|
||||
- **Description**: Returns the number of jobs currently tracked by the client.
|
||||
- **Method**: `job_count()`
|
||||
- **Returns**: Integer representing the total number of jobs.
|
||||
|
||||
### add_job
|
||||
|
||||
- **Description**: Adds a job specification to the job tracking system.
|
||||
- **Method**: `add_job(job_spec)`
|
||||
- **Parameters**:
|
||||
- `job_spec` (JobSpec, optional): The job specification to add. If not provided, a new job ID will be generated.
|
||||
- **Returns**: String representing the job ID of the added job.
|
||||
- **Raises**:
|
||||
- `ValueError`: If a job with the specified job ID already exists.
|
||||
|
||||
### create_job
|
||||
|
||||
- **Description**: Creates a new job with specified parameters and adds it to the job tracking dictionary.
|
||||
- **Method**: `create_job(payload, source_id, source_name, document_type, tasks, job_id, extended_options)`
|
||||
- **Parameters**:
|
||||
- `payload` (str): The payload associated with the job.
|
||||
- `source_id` (str): The source identifier for the job.
|
||||
- `source_name` (str): The unique name of the job's source data.
|
||||
- `document_type` (str, optional): The type of document to be processed.
|
||||
- `tasks` (list, optional): A list of tasks to be associated with the job.
|
||||
- `job_id` (uuid.UUID | str, optional): The unique identifier for the job.
|
||||
- `extended_options` (dict, optional): Additional options for job creation.
|
||||
- **Returns**: String representing the job ID.
|
||||
- **Raises**:
|
||||
- `ValueError`: If a job with the specified job ID already exists.
|
||||
|
||||
### add_task
|
||||
|
||||
- **Description**: Adds a task to an existing job.
|
||||
- **Method**: `add_task(job_id, task)`
|
||||
- **Parameters**:
|
||||
- `job_id` (str): The job ID to which the task will be added.
|
||||
- `task` (Task): The task to add.
|
||||
- **Raises**:
|
||||
- `ValueError`: If the job does not exist or is not in the correct state.
|
||||
|
||||
### create_task
|
||||
|
||||
- **Description**: Creates a task with specified parameters and adds it to an existing job.
|
||||
- **Method**: `create_task(job_id, task_type, task_params)`
|
||||
- **Parameters**:
|
||||
- `job_id` (uuid.UUID | str): The unique identifier of the job.
|
||||
- `task_type` (TaskType): The type of the task.
|
||||
- `task_params` (dict, optional): Parameters for the task.
|
||||
- **Raises**:
|
||||
- `ValueError`: If the job does not exist or if an attempt is made to modify a job after its submission.
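
A combined sketch of these calls (illustrative values; `file_content`, `file_type`, and `file_name` as in the quickstart example):

```python
# Create a job directly on the client, attach an extraction task, then submit it.
job_id = client.create_job(
    document_type=file_type,
    payload=file_content[0],
    source_id=file_name,
    source_name=file_name,
)
client.add_task(job_id, ExtractTask(document_type=file_type, extract_text=True))
client.submit_job(job_id, "morpheus_task_queue")
result = client.fetch_job_result(job_id, timeout=60)
```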
|
||||
|
||||
## CLI Tool
|
||||
|
||||
After installation, you can use the `nv-ingest-cli` tool from the command line to manage and process datasets.
|
||||
|
||||
### CLI Options
|
||||
|
||||
Here are the options provided by the CLI, explained:
|
||||
|
||||
- `--batch_size`: Specifies the number of documents to process in a single batch. Default is 10. Must be 1 or more.
|
||||
- `--doc`: Adds a new document to be processed. Supports multiple entries. Files must exist.
|
||||
- `--dataset`: Specifies the path to a dataset definition file.
|
||||
- `--client`: Sets the client type with choices including REST, Redis, Kafka. Default is Redis.
|
||||
- `--client_host`: Specifies the DNS name or URL for the endpoint.
|
||||
- `--client_port`: Sets the port number for the client endpoint.
|
||||
- `--client_kwargs`: Provides additional arguments to pass to the client. Default is `{}`.
|
||||
- `--concurrency_n`: Defines the number of inflight jobs to maintain at one time. Default is 1.
|
||||
- `--dry_run`: Enables a dry run without executing actions.
|
||||
- `--output_directory`: Specifies the output directory for results.
|
||||
- `--log_level`: Sets the log level. Choices are DEBUG, INFO, WARNING, ERROR, CRITICAL. Default is INFO.
|
||||
- `--shuffle_dataset`: Shuffles the dataset before processing if enabled. Default is true.
|
||||
- `--task`: Allows for specification of tasks in JSON format. Supports multiple tasks.
|
||||
|
||||
## Examples
|
||||
|
||||
Examples of using the CLI tool will be provided here, showing how to execute different tasks.
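
Until fuller examples land, a representative invocation might look like the following (paths and task options are illustrative; see the quickstart in the top-level README):

```shell
nv-ingest-cli \
  --doc ./data/test.pdf \
  --output_directory ./processed_docs \
  --task='extract:{"document_type": "pdf", "extract_method": "pdfium"}' \
  --task='split:{"split_by": "sentence", "split_length": 4}' \
  --client_host=localhost \
  --client_port=6379
```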
|
||||
|
||||
## Configuration
|
||||
|
||||
Details on how to configure the client and customize the behavior of the NV-Ingest-Client.
|
||||
|
||||
## Contributing
|
||||
|
||||
Information on how to contribute to the development of NV-Ingest-Client.
|
||||
|
||||
## License
|
||||
|
||||
NVIDIA Proprietary
|
||||
202
client/examples/sample_job.py
Normal file
202
client/examples/sample_job.py
Normal file
@@ -0,0 +1,202 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
"""
|
||||
Sample client application
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import as_completed
|
||||
|
||||
import click
|
||||
from nv_ingest_client.client import NvIngestClient
|
||||
from nv_ingest_client.primitives import JobSpec
|
||||
from nv_ingest_client.primitives.tasks import ExtractTask
|
||||
from nv_ingest_client.primitives.tasks import SplitTask
|
||||
from nv_ingest_client.util.file_processing.extract import extract_file_content
|
||||
|
||||
logger = logging.getLogger("nv_ingest_client")
|
||||
|
||||
# redis config
|
||||
_DEFAULT_REDIS_HOST = "localhost"
|
||||
_DEFAULT_REDIS_PORT = 6379
|
||||
|
||||
# job config
|
||||
_DEFAULT_TASK_QUEUE = "morpheus_task_queue"
|
||||
_DEFAULT_JOB_TIMEOUT = 90
|
||||
|
||||
# split config
|
||||
_DEFAULT_SPLIT_BY = "sentence"
|
||||
_DEFAULT_SPLIT_LENGTH = 4
|
||||
_DEFAULT_SPLIT_OVERLAP = 1
|
||||
_DEFAULT_SPLIT_MAX_CHARACTER_LENGTH = 1900
|
||||
_DEFAULT_SPLIT_SENTENCE_WINDOW_SIZE = 0
|
||||
|
||||
# extract config
|
||||
_DEFAULT_EXTRACT_METHOD = "pdfium"
|
||||
|
||||
|
||||
# Note: You will need to deploy the nv-ingest service for this example to work.
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("--file-name", help="Path to the file to process.")
|
||||
def _submit_simple(file_name):
|
||||
"""
|
||||
Creates a job_spec with a task of each type and submits to the nv_ingest_service.
|
||||
|
||||
:param file_name: Path to the file to be processed.
|
||||
"""
|
||||
client = NvIngestClient()
|
||||
|
||||
file_content, file_type = extract_file_content(file_name)
|
||||
|
||||
#######################################
|
||||
# Create an empty job directly on the client    #
|
||||
#######################################
|
||||
job_id = client.create_job(
|
||||
document_type=file_type,
|
||||
payload=file_content[0],
|
||||
source_id=file_name,
|
||||
source_name=file_name,
|
||||
extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
|
||||
)
|
||||
|
||||
client.submit_job(job_id, "morpheus_task_queue")
|
||||
result = client.fetch_job_result(job_id)
|
||||
print(f"Got {len(result)} results")
|
||||
|
||||
# Get back the same data that was sent, but wrapped in metadata, content type will be listed as 'structured'
|
||||
# print(result['data'])
|
||||
|
||||
########################################################
|
||||
# Create empty job externally and add it to the client #
|
||||
########################################################
|
||||
|
||||
job_spec = JobSpec(
|
||||
document_type=file_type,
|
||||
payload=file_content[0],
|
||||
source_id=file_name,
|
||||
source_name=file_name,
|
||||
extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
|
||||
)
|
||||
|
||||
job_id = client.add_job(job_spec)
|
||||
client.submit_job(job_id, "morpheus_task_queue")
|
||||
|
||||
result = client.fetch_job_result(job_id)
|
||||
print(f"Got {len(result)} results")
|
||||
|
||||
###############################################################
|
||||
# Create extract only job externally and add it to the client #
|
||||
###############################################################
|
||||
|
||||
job_spec = JobSpec(
|
||||
document_type=file_type,
|
||||
payload=file_content[0],
|
||||
source_id=file_name,
|
||||
source_name=file_name,
|
||||
extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
|
||||
)
|
||||
|
||||
extract_task = ExtractTask(
|
||||
document_type=file_type,
|
||||
extract_text=True,
|
||||
extract_images=True,
|
||||
)
|
||||
|
||||
job_spec.add_task(extract_task)
|
||||
job_id = client.add_job(job_spec)
|
||||
|
||||
client.submit_job(job_id, "morpheus_task_queue")
|
||||
|
||||
result = client.fetch_job_result(job_id)
|
||||
# Get back the extracted pdf data, for 'test.pdf' this will be a text and image artifact.
|
||||
print(f"Got {len(result)} results")
|
||||
|
||||
####################################################################
|
||||
# Create extract and split job externally and add it to the client #
|
||||
####################################################################
|
||||
|
||||
job_spec = JobSpec(
|
||||
document_type=file_type,
|
||||
payload=file_content[0],
|
||||
source_id=file_name,
|
||||
source_name=file_name,
|
||||
extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
|
||||
)
|
||||
|
||||
extract_task = ExtractTask(
|
||||
document_type=file_type,
|
||||
extract_text=True,
|
||||
extract_images=True,
|
||||
)
|
||||
|
||||
split_task = SplitTask(
|
||||
split_by=_DEFAULT_SPLIT_BY,
|
||||
split_length=_DEFAULT_SPLIT_LENGTH,
|
||||
split_overlap=_DEFAULT_SPLIT_OVERLAP,
|
||||
max_character_length=_DEFAULT_SPLIT_MAX_CHARACTER_LENGTH,
|
||||
sentence_window_size=_DEFAULT_SPLIT_SENTENCE_WINDOW_SIZE,
|
||||
)
|
||||
|
||||
job_spec.add_task(extract_task)
|
||||
job_spec.add_task(split_task)
|
||||
job_id = client.add_job(job_spec)
|
||||
|
||||
client.submit_job(job_id, "morpheus_task_queue")
|
||||
|
||||
result = client.fetch_job_result(job_id)
|
||||
# Get back the extracted pdf data
|
||||
print(f"Got {len(result)} results")
|
||||
|
||||
########################################################
|
||||
# Create set of jobs, submit and retrieve all at once #
|
||||
########################################################
|
||||
|
||||
job_ids = []
|
||||
for _ in range(10):
|
||||
job_spec = JobSpec(
|
||||
document_type=file_type,
|
||||
payload=file_content[0],
|
||||
source_id=file_name,
|
||||
source_name=file_name,
|
||||
extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
|
||||
)
|
||||
|
||||
extract_task = ExtractTask(
|
||||
document_type=file_type,
|
||||
extract_method=_DEFAULT_EXTRACT_METHOD,
|
||||
extract_text=True,
|
||||
extract_images=True,
|
||||
)
|
||||
|
||||
split_task = SplitTask(
|
||||
split_by=_DEFAULT_SPLIT_BY,
|
||||
split_length=_DEFAULT_SPLIT_LENGTH,
|
||||
split_overlap=_DEFAULT_SPLIT_OVERLAP,
|
||||
max_character_length=_DEFAULT_SPLIT_MAX_CHARACTER_LENGTH,
|
||||
sentence_window_size=_DEFAULT_SPLIT_SENTENCE_WINDOW_SIZE,
|
||||
)
|
||||
|
||||
job_spec.add_task(split_task)
|
||||
job_spec.add_task(extract_task)
|
||||
|
||||
job_ids.append(client.add_job(job_spec))
|
||||
|
||||
submit_futures = client.submit_job_async(job_ids, "morpheus_task_queue")
|
||||
for _ in as_completed(submit_futures):
|
||||
pass
|
||||
|
||||
print(f"Jobs {job_ids} submitted successfully.")
|
||||
fetch_futures = client.fetch_job_result_async(job_ids, timeout=60)
|
||||
for future in as_completed(fetch_futures):
|
||||
result = future.result()
|
||||
print(f"Got {len(result)} results")
|
||||
|
||||
|
||||
# Updated to use the Click library for command line parsing
|
||||
if __name__ == "__main__":
|
||||
_submit_simple()
|
||||
10
client/requirements.txt
Normal file
10
client/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
charset-normalizer
|
||||
pydantic
|
||||
python-magic
|
||||
redis~=5.0.1
|
||||
setuptools
|
||||
click
|
||||
pypdfium2
|
||||
python-docx
|
||||
python-pptx==0.6.23
|
||||
tqdm
|
||||
70
client/setup.py
Normal file
70
client/setup.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import datetime
|
||||
import os
|
||||
import re
|
||||
|
||||
from setuptools import find_packages
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
# TODO(Devin): This is duplicated in nv_ingest's setup.py, should be moved to common once Jermey's PR is merged
|
||||
def get_version():
|
||||
release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
|
||||
version = os.getenv("NV_INGEST_CLIENT_VERSION")
|
||||
rev = os.getenv("NV_INGEST_REV", "0")
|
||||
|
||||
if not version:
|
||||
version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
|
||||
|
||||
# Ensure the version is PEP 440 compatible
|
||||
pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
|
||||
if not re.match(pep440_regex, version):
|
||||
raise ValueError(f"Version '{version}' is not PEP 440 compatible")
|
||||
|
||||
# Construct the final version string
|
||||
if release_type == "dev":
|
||||
final_version = f"{version}.dev{rev}"
|
||||
elif release_type == "release":
|
||||
final_version = f"{version}.post{rev}" if int(rev) > 0 else version
|
||||
else:
|
||||
raise ValueError(f"Invalid release type: {release_type}")
|
||||
|
||||
return final_version
|
||||
|
||||
|
||||
def read_requirements(file_name):
|
||||
"""Read a requirements file and return a list of its packages."""
|
||||
with open(file_name) as f:
|
||||
return f.read().splitlines()
|
||||
|
||||
|
||||
# Specify your requirements files
|
||||
requirements_files = [
|
||||
"requirements.txt",
|
||||
]
|
||||
|
||||
# Read and combine requirements from all specified files
|
||||
combined_requirements = []
|
||||
for file in requirements_files:
|
||||
combined_requirements.extend(read_requirements(file))
|
||||
|
||||
combined_requirements = list(set(combined_requirements))
|
||||
|
||||
setup(
|
||||
author="Anuradha Karuppiah",
|
||||
author_email="anuradhak@nvidia.com",
|
||||
classifiers=[],
|
||||
description="Python client for the nv-ingest service",
|
||||
entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main"]},
|
||||
install_requires=combined_requirements,
|
||||
name="nv_ingest_client",
|
||||
package_dir={"": "src"},
|
||||
packages=find_packages(where="src"),
|
||||
python_requires=">=3.10",
|
||||
version=get_version(),
|
||||
license="Apache-2.0",
|
||||
)
|
||||
0
client/src/nv_ingest_client/__init__.py
Normal file
0
client/src/nv_ingest_client/__init__.py
Normal file
0
client/src/nv_ingest_client/cli/__init__.py
Normal file
0
client/src/nv_ingest_client/cli/__init__.py
Normal file
0
client/src/nv_ingest_client/cli/util/__init__.py
Normal file
0
client/src/nv_ingest_client/cli/util/__init__.py
Normal file
247
client/src/nv_ingest_client/cli/util/click.py
Normal file
247
client/src/nv_ingest_client/cli/util/click.py
Normal file
@@ -0,0 +1,247 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from enum import Enum
|
||||
from pprint import pprint
|
||||
|
||||
import click
|
||||
from nv_ingest_client.cli.util.processing import check_schema
|
||||
from nv_ingest_client.primitives.tasks import CaptionTask
|
||||
from nv_ingest_client.primitives.tasks import DedupTask
|
||||
from nv_ingest_client.primitives.tasks import EmbedTask
|
||||
from nv_ingest_client.primitives.tasks import ExtractTask
|
||||
from nv_ingest_client.primitives.tasks import FilterTask
|
||||
from nv_ingest_client.primitives.tasks import SplitTask
|
||||
from nv_ingest_client.primitives.tasks import StoreTask
|
||||
from nv_ingest_client.primitives.tasks import VdbUploadTask
|
||||
from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
|
||||
from nv_ingest_client.primitives.tasks.vdb_upload import VdbUploadTaskSchema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LogLevel(str, Enum):
|
||||
DEBUG = "DEBUG"
|
||||
INFO = "INFO"
|
||||
WARNING = "WARNING"
|
||||
ERROR = "ERROR"
|
||||
CRITICAL = "CRITICAL"
|
||||
|
||||
|
||||
class ClientType(str, Enum):
|
||||
REST = "REST"
|
||||
REDIS = "REDIS"
|
||||
KAFKA = "KAFKA"
|
||||
|
||||
|
||||
# Example TaskId validation set
|
||||
VALID_TASK_IDS = {"task1", "task2", "task3"}
|
||||
|
||||
_MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
|
||||
|
||||
|
||||
def debug_print_click_options(ctx):
|
||||
"""
|
||||
Retrieves all options from the Click context and pretty prints them.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ctx : click.Context
|
||||
The Click context object from which to retrieve the command options.
|
||||
"""
|
||||
click_options = {}
|
||||
for param in ctx.command.params:
|
||||
if isinstance(param, click.Option):
|
||||
value = ctx.params[param.name]
|
||||
click_options[param.name] = value
|
||||
|
||||
pprint(click_options)
|
||||
|
||||
|
||||
def click_validate_file_exists(ctx, param, value):
|
||||
if not value:
|
||||
return []
|
||||
|
||||
if isinstance(value, str):
|
||||
value = [value]
|
||||
else:
|
||||
value = list(value)
|
||||
|
||||
for filepath in value:
|
||||
if not os.path.exists(filepath):
|
||||
raise click.BadParameter(f"File does not exist: {filepath}")
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def click_validate_task(ctx, param, value):
|
||||
validated_tasks = {}
|
||||
validation_errors = []
|
||||
|
||||
for task_str in value:
|
||||
task_split = task_str.split(":", 1)
|
||||
if len(task_split) != 2:
|
||||
task_id, json_options = task_str, "{}"
|
||||
else:
|
||||
task_id, json_options = task_split
|
||||
|
||||
try:
|
||||
options = json.loads(json_options)
|
||||
|
||||
if task_id == "split":
|
||||
task_options = check_schema(SplitTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = SplitTask(**task_options.dict())
|
||||
elif task_id == "extract":
|
||||
task_options = check_schema(ExtractTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}_{task_options.document_type}"
|
||||
new_task = ExtractTask(**task_options.dict())
|
||||
elif task_id == "store":
|
||||
task_options = check_schema(StoreTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = StoreTask(**task_options.dict())
|
||||
elif task_id == "caption":
|
||||
task_options = check_schema(CaptionTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = CaptionTask(**task_options.dict())
|
||||
elif task_id == "dedup":
|
||||
task_options = check_schema(DedupTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = DedupTask(**task_options.dict())
|
||||
elif task_id == "filter":
|
||||
task_options = check_schema(FilterTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = FilterTask(**task_options.dict())
|
||||
elif task_id == "embed":
|
||||
task_options = check_schema(EmbedTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = EmbedTask(**task_options.dict())
|
||||
elif task_id == "vdb_upload":
|
||||
task_options = check_schema(VdbUploadTaskSchema, options, task_id, json_options)
|
||||
new_task_id = f"{task_id}"
|
||||
new_task = VdbUploadTask(**task_options.dict())
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported task type: {task_id}")
|
||||
|
||||
logger.debug("Adding task: %s", new_task_id)
|
||||
validated_tasks[new_task_id] = new_task
|
||||
except ValueError as e:
|
||||
validation_errors.append(str(e))
|
||||
|
||||
if validation_errors:
|
||||
# Aggregate error messages with original values highlighted
|
||||
error_message = "\n".join(validation_errors)
|
||||
# logger.error(error_message)
|
||||
raise click.BadParameter(error_message)
|
||||
|
||||
return validated_tasks
|
||||
|
||||
|
||||
def click_validate_batch_size(ctx, param, value):
|
||||
if value < 1:
|
||||
raise click.BadParameter("Batch size must be >= 1.")
|
||||
return value
|
||||
|
||||
|
||||
def pre_process_dataset(dataset_json: str, shuffle_dataset: bool):
|
||||
"""
|
||||
Loads a dataset from a JSON file and optionally shuffles the list of files contained within.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset_json : str
|
||||
The path to the dataset JSON file.
|
||||
shuffle_dataset : bool, optional
|
||||
Whether to shuffle the dataset before processing. Defaults to True.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list
|
||||
The list of files from the dataset, possibly shuffled.
|
||||
"""
|
||||
try:
|
||||
with open(dataset_json, "r") as f:
|
||||
file_source = json.load(f)
|
||||
except FileNotFoundError:
|
||||
raise click.BadParameter(f"Dataset JSON file not found: {dataset_json}")
|
||||
except json.JSONDecodeError:
|
||||
raise click.BadParameter(f"Invalid JSON format in file: {dataset_json}")
|
||||
|
||||
# Extract the list of files and optionally shuffle them
|
||||
file_source = file_source.get("sampled_files", [])
|
||||
|
||||
if shuffle_dataset:
|
||||
random.shuffle(file_source)
|
||||
|
||||
return file_source
|
||||
|
||||
|
||||
def _generate_matching_files(file_sources):
|
||||
"""
|
||||
Generates a list of file paths that match the given patterns specified in file_sources.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
file_sources : list of str
|
||||
A list containing the file source patterns to match against.
|
||||
|
||||
Returns
|
||||
-------
|
||||
generator
|
||||
A generator yielding paths to files that match the specified patterns.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function utilizes glob pattern matching to find files that match the specified patterns.
|
||||
It yields each matching file path, allowing for efficient processing of potentially large
|
||||
sets of files.
|
||||
"""
|
||||
|
||||
files = [
|
||||
file_path
|
||||
for pattern in file_sources
|
||||
for file_path in glob.glob(pattern, recursive=True)
|
||||
if os.path.isfile(file_path)
|
||||
]
|
||||
for file_path in files:
|
||||
yield file_path
|
||||
|
||||
|
||||
def click_match_and_validate_files(ctx, param, value):
|
||||
"""
|
||||
Matches and validates files based on the provided file source patterns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
value : list of str
|
||||
A list containing file source patterns to match against.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of str or None
|
||||
A list of matching file paths if any matches are found; otherwise, None.
|
||||
"""
|
||||
|
||||
if not value:
|
||||
return []
|
||||
|
||||
matching_files = list(_generate_matching_files(value))
|
||||
if not matching_files:
|
||||
logger.warning("No files found matching the specified patterns.")
|
||||
return []
|
||||
|
||||
return matching_files
|
||||
101
client/src/nv_ingest_client/cli/util/dataset.py
Normal file
101
client/src/nv_ingest_client/cli/util/dataset.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from collections import Counter
|
||||
from io import BytesIO
|
||||
from io import StringIO
|
||||
from pprint import pformat
|
||||
|
||||
|
||||
def get_dataset_statistics(dataset_bytes: BytesIO) -> str:
|
||||
"""
|
||||
Reads a dataset specification from a BytesIO object, computes statistics about the dataset,
|
||||
and returns a formatted string.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset_bytes : BytesIO
|
||||
The BytesIO object containing the dataset in JSON format.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
A formatted string containing statistics about the dataset.
|
||||
"""
|
||||
try:
|
||||
dataset_bytes.seek(0)
|
||||
dataset = json.load(dataset_bytes)
|
||||
except json.JSONDecodeError:
|
||||
raise
|
||||
|
||||
sampled_files = dataset.get("sampled_files", [])
|
||||
metadata = dataset.get("metadata", {})
|
||||
|
||||
# Compute statistics
|
||||
file_types = [os.path.splitext(file)[1][1:].lower() for file in sampled_files]
|
||||
file_type_counts = Counter(file_types)
|
||||
unique_files = set(sampled_files)
|
||||
unique_file_types = {
|
||||
file_type: len(set(f for f in sampled_files if f.endswith("." + file_type))) for file_type in file_type_counts
|
||||
}
|
||||
|
||||
total_size_bytes = sum(os.path.getsize(f) for f in sampled_files)
|
||||
total_size_gb = total_size_bytes / (1024**3)
|
||||
|
||||
file_type_sizes = {
|
||||
ftype: sum(os.path.getsize(f) for f in sampled_files if f.endswith("." + ftype)) for ftype in file_type_counts
|
||||
}
|
||||
file_type_sizes_gb = {ftype: size / (1024**3) for ftype, size in file_type_sizes.items()}
|
||||
|
||||
estimated_sizes_gb = {
|
||||
ftype: metadata["file_type_proportions"][ftype]["target_proportion"] / 100 * total_size_gb
|
||||
for ftype in metadata["file_type_proportions"]
|
||||
}
|
||||
|
||||
# Format statistics as a string
|
||||
stats_stringio = StringIO()
|
||||
stats = {
|
||||
"metadata": metadata,
|
||||
"total_number_of_files": len(sampled_files),
|
||||
"total_number_of_unique_files": len(unique_files),
|
||||
"total_number_of_files_per_file_type": file_type_counts,
|
||||
"total_number_of_unique_files_per_file_type": unique_file_types,
|
||||
"total_size_gb": total_size_gb,
|
||||
"total_size_per_file_type_gb": file_type_sizes_gb,
|
||||
"estimated_total_size_per_file_type_gb": estimated_sizes_gb,
|
||||
}
|
||||
stats_stringio.write("Dataset Statistics:\n")
|
||||
stats_stringio.write(pformat(stats))
|
||||
|
||||
return stats_stringio.getvalue()
|
||||
|
||||
|
||||
def get_dataset_files(dataset_bytes: BytesIO, shuffle: bool = False) -> list:
|
||||
"""
|
||||
Extracts and optionally shuffles the list of files contained in a dataset.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset_bytes : BytesIO
|
||||
The BytesIO object containing the dataset in JSON format.
|
||||
shuffle : bool, optional
|
||||
Whether to shuffle the list of files before returning. Defaults to False.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list
|
||||
The list of files from the dataset, possibly shuffled.
|
||||
"""
|
||||
try:
|
||||
dataset_bytes.seek(0)
|
||||
dataset = json.load(dataset_bytes)
|
||||
sampled_files = dataset.get("sampled_files", [])
|
||||
if shuffle:
|
||||
random.shuffle(sampled_files)
|
||||
return sampled_files
|
||||
except json.JSONDecodeError as err:
|
||||
raise ValueError(f"{err}")
|
||||
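For reference, a minimal usage sketch of the two dataset helpers above; the dataset file name is a placeholder, and the JSON structure simply mirrors the "sampled_files"/"metadata" keys these functions already read:

    # Illustrative only -- "dataset.json" is a hypothetical path, not part of this commit.
    from io import BytesIO

    from nv_ingest_client.cli.util.dataset import get_dataset_files
    from nv_ingest_client.cli.util.dataset import get_dataset_statistics

    with open("dataset.json", "rb") as f:
        dataset_bytes = BytesIO(f.read())

    print(get_dataset_statistics(dataset_bytes))             # counts and sizes per file type
    files = get_dataset_files(dataset_bytes, shuffle=True)   # file paths, optionally shuffled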
453
client/src/nv_ingest_client/cli/util/processing.py
Normal file
@@ -0,0 +1,453 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import as_completed
|
||||
from statistics import mean
|
||||
from statistics import median
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Type
|
||||
|
||||
from click import style
|
||||
from nv_ingest_client.client import NvIngestClient
|
||||
from nv_ingest_client.primitives import JobSpec
|
||||
from nv_ingest_client.util.file_processing.extract import extract_file_content
|
||||
from nv_ingest_client.util.util import check_ingest_result
|
||||
from nv_ingest_client.util.util import estimate_page_count
|
||||
from pydantic import BaseModel
|
||||
from pydantic import ValidationError
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def highlight_error_in_original(original_str: str, task_name: str, error_detail: dict) -> str:
|
||||
"""
|
||||
Directly highlights the error-causing text in the original JSON string based on the error type.
|
||||
For 'extra fields' errors, it attempts to colorize the specific field name in the original string.
|
||||
For 'missing fields', it appends a clear message indicating the missing field.
|
||||
"""
|
||||
error_type = error_detail["type"]
|
||||
error_location = "->".join(map(str, error_detail["loc"]))
|
||||
if error_type == "value_error.extra":
|
||||
error_key = error_detail["loc"][-1]
|
||||
highlighted_key = style(error_key, fg="blue", bold=True)
|
||||
highlighted_str = original_str.replace(f'"{error_key}"', highlighted_key)
|
||||
elif error_type in ["value_error.missing", "value_error.any_str.min_length"]:
|
||||
missing_message = style(f"'{error_location}'", fg="blue", bold=True)
|
||||
highlighted_str = (
|
||||
f"{original_str}\n(Schema Error): Missing required parameter for task '{task_name}'"
|
||||
f" {missing_message}\n -> {original_str}"
|
||||
)
|
||||
else:
|
||||
error_key = error_detail["loc"][-1]
|
||||
highlighted_key = style(error_key, fg="blue", bold=True)
|
||||
highlighted_str = original_str.replace(f'"{error_key}"', highlighted_key)
|
||||
|
||||
return highlighted_str
|
||||
|
||||
|
||||
def format_validation_error(e: ValidationError, task_id, original_str: str) -> str:
|
||||
"""
|
||||
Formats validation errors with appropriate highlights and returns a detailed error message.
|
||||
"""
|
||||
error_messages = []
|
||||
for error in e.errors():
|
||||
error_message = f"(Schema Error): {error['msg']}"
|
||||
highlighted_str = highlight_error_in_original(original_str, task_id, error)
|
||||
error_messages.append(f"{error_message}\n -> {highlighted_str}")
|
||||
|
||||
return "\n".join(error_messages)
|
||||
|
||||
|
||||
def check_schema(schema: Type[BaseModel], options: dict, task_id: str, original_str: str) -> BaseModel:
|
||||
try:
|
||||
return schema(**options)
|
||||
except ValidationError as e:
|
||||
error_message = format_validation_error(e, task_id, original_str)
|
||||
# logger.error(error_message)
|
||||
raise ValueError(error_message) from e
|
||||
|
||||
|
||||
def report_stage_statistics(
|
||||
stage_elapsed_times: defaultdict, total_trace_elapsed: float, abs_elapsed: float
|
||||
) -> None:
|
||||
"""
|
||||
Reports the statistics for each processing stage, including average, median, total time spent,
|
||||
and their respective percentages of the total processing time.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
stage_elapsed_times : defaultdict(list)
|
||||
A defaultdict containing lists of elapsed times for each processing stage, in nanoseconds.
|
||||
total_trace_elapsed : float
|
||||
The total elapsed time across all processing stages, in nanoseconds.
|
||||
abs_elapsed : float
|
||||
The absolute elapsed time from the start to the end of processing, in nanoseconds.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function logs the average, median, and total time for each stage, along with the percentage of total
|
||||
computation.
|
||||
It also calculates and logs the unresolved time, if any, that is not accounted for by the recorded stages.
|
||||
"""
|
||||
|
||||
for stage, times in stage_elapsed_times.items():
|
||||
if times:
|
||||
avg_time = mean(times)
|
||||
med_time = median(times)
|
||||
total_stage_time = sum(times)
|
||||
percent_of_total = (total_stage_time / total_trace_elapsed * 100) if total_trace_elapsed > 0 else 0
|
||||
logger.info(
|
||||
f"{stage}: Avg: {avg_time / 1e6:.2f} ms, Median: {med_time / 1e6:.2f} ms, "
|
||||
f"Total Time: {total_stage_time / 1e6:.2f} ms, Total % of Trace Computation: {percent_of_total:.2f}%"
|
||||
)
|
||||
|
||||
unresolved_time = abs_elapsed - total_trace_elapsed
|
||||
if unresolved_time > 0:
|
||||
percent_unresolved = unresolved_time / abs_elapsed * 100
|
||||
logger.info(
|
||||
f"Unresolved time: {unresolved_time / 1e6:.2f} ms, Percent of Total Elapsed: {percent_unresolved:.2f}%"
|
||||
)
|
||||
else:
|
||||
logger.info("No unresolved time detected. Trace times account for the entire elapsed duration.")
|
||||
|
||||
|
||||
def report_overall_speed(total_pages_processed: int, start_time_ns: int, total_files: int) -> None:
|
||||
"""
|
||||
Reports the overall processing speed based on the number of pages and files processed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
total_pages_processed : int
|
||||
The total number of pages processed.
|
||||
start_time_ns : int
|
||||
The nanosecond timestamp marking the start of processing.
|
||||
total_files : int
|
||||
The total number of files processed.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function calculates the total elapsed time from the start of processing and reports the throughput
|
||||
in terms of pages and files processed per second.
|
||||
"""
|
||||
|
||||
total_elapsed_time_ns = time.time_ns() - start_time_ns
|
||||
total_elapsed_time_s = total_elapsed_time_ns / 1_000_000_000 # Convert nanoseconds to seconds
|
||||
|
||||
throughput_pages = total_pages_processed / total_elapsed_time_s # pages/sec
|
||||
throughput_files = total_files / total_elapsed_time_s # files/sec
|
||||
|
||||
logger.info(f"Processed {total_files} files in {total_elapsed_time_s:.2f} seconds.")
|
||||
logger.info(f"Total pages processed: {total_pages_processed}")
|
||||
logger.info(f"Throughput (Pages/sec): {throughput_pages:.2f}")
|
||||
logger.info(f"Throughput (Files/sec): {throughput_files:.2f}")
|
||||
|
||||
|
||||
def report_statistics(
|
||||
start_time_ns: int,
|
||||
stage_elapsed_times: defaultdict,
|
||||
total_pages_processed: int,
|
||||
total_files: int,
|
||||
total_timeouts: int,
|
||||
) -> None:
|
||||
"""
|
||||
Aggregates and reports statistics for the entire processing session.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
start_time_ns : int
|
||||
The nanosecond timestamp marking the start of the processing.
|
||||
stage_elapsed_times : defaultdict(list)
|
||||
A defaultdict where each key is a processing stage and each value is a list
|
||||
of elapsed times in nanoseconds for that stage.
|
||||
total_pages_processed : int
|
||||
The total number of pages processed during the session.
|
||||
total_files : int
|
||||
The total number of files processed during the session.
|
||||
total_timeouts : int
|
||||
The total number of timeouts that occurred during processing.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function calculates the absolute elapsed time from the start of processing to the current
|
||||
time and the total time taken by all stages.
|
||||
"""
|
||||
|
||||
abs_elapsed = time.time_ns() - start_time_ns
|
||||
total_trace_elapsed = sum(sum(times) for times in stage_elapsed_times.values())
|
||||
report_stage_statistics(stage_elapsed_times, total_trace_elapsed, abs_elapsed)
|
||||
report_overall_speed(total_pages_processed, start_time_ns, total_files)
|
||||
logger.info(f"Total timeouts: {total_timeouts}")
|
||||
|
||||
|
||||
def process_response(response, stage_elapsed_times):
|
||||
"""
|
||||
Process the response to extract trace data and calculate elapsed time for each stage.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
response : dict
|
||||
The response dictionary containing trace information for processing stages.
|
||||
stage_elapsed_times : defaultdict(list)
|
||||
A defaultdict to accumulate elapsed times for each processing stage.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The function iterates over trace data in the response, identifying entry and exit times for
|
||||
each stage, and calculates the elapsed time which is then appended to the respective stage in
|
||||
`stage_elapsed_times`.
|
||||
"""
|
||||
|
||||
trace_data = response.get("trace", {})
|
||||
for key, entry_time in trace_data.items():
|
||||
if "entry" in key:
|
||||
exit_key = key.replace("entry", "exit")
|
||||
exit_time = trace_data.get(exit_key)
|
||||
if exit_time:
|
||||
stage_name = key.split("::")[2]
|
||||
elapsed_time = exit_time - entry_time
|
||||
stage_elapsed_times[stage_name].append(elapsed_time)
|
||||
|
||||
|
||||
def organize_documents_by_type(response_data):
|
||||
doc_map = {}
|
||||
for document in response_data:
|
||||
doc_meta = document["metadata"]
|
||||
# TODO: fix this. doc_meta can be a json string or a dict.
|
||||
if isinstance(doc_meta, str):
|
||||
doc_meta = json.loads(doc_meta)
|
||||
doc_content_metadata = doc_meta["content_metadata"]
|
||||
doc_type = doc_content_metadata["type"]
|
||||
if doc_type not in doc_map:
|
||||
doc_map[doc_type] = []
|
||||
doc_map[doc_type].append(document)
|
||||
return doc_map
|
||||
|
||||
|
||||
def save_response_data(response, output_directory):
|
||||
if ("data" not in response) or (not response["data"]):
|
||||
return
|
||||
|
||||
response_data = response["data"]
|
||||
|
||||
if not isinstance(response_data, list) or len(response_data) == 0:
|
||||
return
|
||||
|
||||
doc_meta_base = response_data[0]["metadata"]
|
||||
source_meta = doc_meta_base["source_metadata"]
|
||||
doc_name = source_meta["source_id"]
|
||||
clean_doc_name = get_valid_filename(os.path.basename(doc_name))
|
||||
output_name = f"{clean_doc_name}.metadata.json"
|
||||
|
||||
doc_map = organize_documents_by_type(response_data)
|
||||
for doc_type, documents in doc_map.items():
|
||||
doc_type_path = os.path.join(output_directory, doc_type)
|
||||
if not os.path.exists(doc_type_path):
|
||||
os.makedirs(doc_type_path)
|
||||
|
||||
with open(os.path.join(doc_type_path, output_name), "w") as f:
|
||||
f.write(json.dumps(documents, indent=2))
|
||||
|
||||
|
||||
def create_job_specs_for_batch(files_batch: List[str], tasks: Dict, client: NvIngestClient) -> List[str]:
|
||||
"""
|
||||
Creates JobSpecs for a batch of files and submits them, returning job IDs.
|
||||
"""
|
||||
job_ids = []
|
||||
for file_name in files_batch:
|
||||
try:
|
||||
file_content, file_type = extract_file_content(file_name) # Assume these are defined
|
||||
except ValueError as ve:
|
||||
logger.error(f"Error extracting content from {file_name}: {ve}")
|
||||
continue
|
||||
|
||||
job_spec = JobSpec(
|
||||
document_type=file_type,
|
||||
payload=file_content,
|
||||
source_id=file_name,
|
||||
source_name=file_name,
|
||||
extended_options={"tracing_options": {"trace": True, "ts_send": time.time_ns()}},
|
||||
)
|
||||
|
||||
logger.debug(f"Tasks: {tasks.keys()}")
|
||||
for task in tasks:
|
||||
logger.debug(f"Task: {task}")
|
||||
|
||||
# TODO(Devin): Formalize this later, don't have time right now.
|
||||
if "split" in tasks:
|
||||
job_spec.add_task(tasks["split"])
|
||||
|
||||
if f"extract_{file_type}" in tasks:
|
||||
job_spec.add_task(tasks[f"extract_{file_type}"])
|
||||
|
||||
if "store" in tasks:
|
||||
job_spec.add_task(tasks["store"])
|
||||
|
||||
if "caption" in tasks:
|
||||
job_spec.add_task(tasks["caption"])
|
||||
|
||||
if "dedup" in tasks:
|
||||
job_spec.add_task(tasks["dedup"])
|
||||
|
||||
if "filter" in tasks:
|
||||
job_spec.add_task(tasks["filter"])
|
||||
|
||||
if "embed" in tasks:
|
||||
job_spec.add_task(tasks["embed"])
|
||||
|
||||
if "vdb_upload" in tasks:
|
||||
job_spec.add_task(tasks["vdb_upload"])
|
||||
|
||||
job_id = client.add_job(job_spec)
|
||||
job_ids.append(job_id)
|
||||
|
||||
return job_ids
|
||||
|
||||
|
||||
# TODO(Devin): Circle back on this, we can refactor to be better at keeping as many jobs in-flight as possible.
|
||||
def create_and_process_jobs(
|
||||
files: List[str],
|
||||
client: NvIngestClient,
|
||||
tasks: Dict,
|
||||
output_directory: str,
|
||||
batch_size: int,
|
||||
timeout: int = 10,
|
||||
fail_on_error: bool = False,
|
||||
):
|
||||
"""
|
||||
Processes a list of files, creating and submitting jobs for each file, then fetching results.
|
||||
Manages retries for timeouts and logs failures for decoding errors.
|
||||
Limits the number of JobSpecs in memory to batch_size * 2. Progress is reported per file.
|
||||
"""
|
||||
total_files = len(files)
|
||||
total_pages_processed = 0
|
||||
total_timeouts = 0
|
||||
trace_times = defaultdict(list)
|
||||
failed_jobs = []
|
||||
retry_job_ids = []
|
||||
job_id_map = {}
|
||||
retry_counts = defaultdict(int)
|
||||
file_page_counts = {file: estimate_page_count(file) for file in files}
|
||||
|
||||
start_time_ns = time.time_ns()
|
||||
with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
|
||||
processed = 0
|
||||
while (processed < len(files)) or retry_job_ids:
|
||||
# Process new batch of files or retry failed job IDs
|
||||
job_ids = []
|
||||
cur_job_count = 0
|
||||
if retry_job_ids:
|
||||
# logger.info(f"Adding retry jobs: {[job_id_map[jid] for jid in retry_job_ids]}")
|
||||
job_ids.extend(retry_job_ids)
|
||||
cur_job_count = len(job_ids)
|
||||
retry_job_ids = [] # Clear retry list after assigning
|
||||
|
||||
if (cur_job_count < batch_size) and (processed < len(files)):
|
||||
new_job_count = min(batch_size - cur_job_count, len(files) - processed)
|
||||
batch_files = files[processed : processed + new_job_count] # noqa: E203
|
||||
|
||||
new_job_ids = create_job_specs_for_batch(batch_files, tasks, client)
|
||||
if len(new_job_ids) != new_job_count:
|
||||
missing_jobs = new_job_count - len(new_job_ids)
|
||||
error_msg = (
|
||||
f"Missing {missing_jobs} job specs -- this is likely due to bad reads or file corruption"
|
||||
)
|
||||
if fail_on_error:
|
||||
raise RuntimeError(error_msg)
|
||||
|
||||
logger.warning(error_msg)
|
||||
pbar.update(missing_jobs)
|
||||
|
||||
job_id_map.update({job_id: file for job_id, file in zip(new_job_ids, batch_files)})
|
||||
|
||||
processed += new_job_count
|
||||
_ = client.submit_job_async(new_job_ids, "morpheus_task_queue")
|
||||
job_ids.extend(new_job_ids)
|
||||
|
||||
futures_dict = client.fetch_job_result_async(job_ids, timeout=timeout, data_only=False)
|
||||
|
||||
for future in as_completed(futures_dict.keys()):
|
||||
retry = False
|
||||
job_id = futures_dict[future]
|
||||
try:
|
||||
result, _ = future.result()[0]
|
||||
if ("annotations" in result) and result["annotations"]:
|
||||
annotations = result["annotations"]
|
||||
for key, value in annotations.items():
|
||||
logger.debug(f"Annotation: {key} -> {json.dumps(value, indent=2)}")
|
||||
|
||||
valid_result, description = check_ingest_result(result)
|
||||
|
||||
if valid_result:
|
||||
raise RuntimeError(f"Failed to process job {job_id}: {description}")
|
||||
|
||||
source_name = job_id_map[job_id]
|
||||
|
||||
if output_directory:
|
||||
save_response_data(result, output_directory)
|
||||
|
||||
total_pages_processed += file_page_counts[source_name]
|
||||
elapsed_time = (time.time_ns() - start_time_ns) / 1e9
|
||||
pages_per_sec = total_pages_processed / elapsed_time if elapsed_time > 0 else 0
|
||||
pbar.set_postfix(pages_per_sec=f"{pages_per_sec:.2f}")
|
||||
|
||||
process_response(result, trace_times)
|
||||
except TimeoutError:
|
||||
source_name = job_id_map[job_id]
|
||||
retry_counts[source_name] += 1
|
||||
|
||||
# TODO(Devin): not sure if we actually want a retry limit; if we don't get an actual failure
|
||||
# condition just assume we should continue waiting.
|
||||
# if retry_counts[source_name] > 10:
|
||||
# logger.error(f"Timeout error for job {job_id} after {retry_counts[source_name]} retries.")
|
||||
# failed_jobs.append(f"{job_id}::{source_name}")
|
||||
# else:
|
||||
retry_job_ids.append(job_id) # Add job_id back to retry list
|
||||
total_timeouts += 1
|
||||
retry = True
|
||||
except json.JSONDecodeError as e:
|
||||
source_name = job_id_map[job_id]
|
||||
logger.error(f"Decoding error for job {job_id}::{source_name} {e}")
|
||||
failed_jobs.append(f"{job_id}::{source_name}")
|
||||
except RuntimeError as e:
|
||||
source_name = job_id_map[job_id]
|
||||
logger.error(f"Processing error was reported for {job_id}::{source_name} {e}")
|
||||
failed_jobs.append(f"{job_id}::{source_name}")
|
||||
except Exception as e:
|
||||
source_name = job_id_map[job_id]
|
||||
logger.error(f"Unhandled error occurred processing {job_id}:{source_name} {e}")
|
||||
failed_jobs.append(f"{job_id}::{source_name}")
|
||||
finally:
|
||||
if not retry:
|
||||
pbar.update(1)
|
||||
|
||||
if failed_jobs:
|
||||
logger.error(f"Failed jobs due to decoding or other errors: {failed_jobs}")
|
||||
|
||||
return total_files, trace_times, total_pages_processed, total_timeouts
|
||||
|
||||
|
||||
def get_valid_filename(name):
|
||||
"""
|
||||
Taken from https://github.com/django/django/blob/main/django/utils/text.py.
|
||||
Return the given string converted to a string that can be used for a clean
|
||||
filename. Remove leading and trailing spaces; convert other spaces to
|
||||
underscores; and remove anything that is not an alphanumeric, dash,
|
||||
underscore, or dot.
|
||||
>>> get_valid_filename("john's portrait in 2004.jpg")
|
||||
'johns_portrait_in_2004.jpg'
|
||||
"""
|
||||
s = str(name).strip().replace(" ", "_")
|
||||
s = re.sub(r"(?u)[^-\w.]", "", s)
|
||||
if s in {"", ".", ".."}:
|
||||
raise ValueError("Could not derive file name from '%s'" % name)
|
||||
return s
|
||||
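To make the trace convention that process_response relies on concrete, here is a small sketch; the key names are hypothetical, and only the "entry"/"exit" naming plus the "::"-delimited stage name (third segment) match what the function expects:

    from collections import defaultdict

    # Hypothetical trace keys; real keys are produced by the service.
    response = {
        "trace": {
            "trace::entry::pdf_extractor": 1_000_000,
            "trace::exit::pdf_extractor": 4_000_000,
        }
    }
    stage_elapsed_times = defaultdict(list)
    process_response(response, stage_elapsed_times)
    # stage_elapsed_times == {"pdf_extractor": [3_000_000]}  (nanoseconds)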
97
client/src/nv_ingest_client/cli/util/system.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import logging
|
||||
import os
|
||||
import stat
|
||||
|
||||
|
||||
def configure_logging(logger, log_level: str):
|
||||
"""
|
||||
Configures the logging level based on a log_level string.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
log_level : str
|
||||
The logging level as a string, expected to be one of
|
||||
'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
|
||||
"""
|
||||
level_dict = {
|
||||
"DEBUG": logging.DEBUG,
|
||||
"INFO": logging.INFO,
|
||||
"WARNING": logging.WARNING,
|
||||
"ERROR": logging.ERROR,
|
||||
"CRITICAL": logging.CRITICAL,
|
||||
}
|
||||
|
||||
# Convert the log level string to a logging level.
|
||||
numeric_level = level_dict.get(log_level.upper(), None)
|
||||
if numeric_level is None:
|
||||
raise ValueError(f"Invalid log level: {log_level}")
|
||||
|
||||
# Configure the logger to the specified level.
|
||||
logging.basicConfig(level=numeric_level)
|
||||
logger.setLevel(numeric_level)
|
||||
logger.debug(f"Logging configured to {log_level} level.")
|
||||
|
||||
|
||||
def has_permissions(path: str, read: bool = False, write: bool = False) -> bool:
|
||||
"""
|
||||
Checks if the current user has specified permissions on a path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
The filesystem path to check permissions on.
|
||||
read : bool, optional
|
||||
Whether to check for read permission.
|
||||
write : bool, optional
|
||||
Whether to check for write permission.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if the path has the specified permissions, False otherwise.
|
||||
"""
|
||||
if not os.path.exists(path):
|
||||
return False
|
||||
|
||||
current_permissions = os.stat(path).st_mode
|
||||
has_read = not read or bool(current_permissions & stat.S_IRUSR)
|
||||
has_write = not write or bool(current_permissions & stat.S_IWUSR)
|
||||
|
||||
return has_read and has_write
|
||||
|
||||
|
||||
def ensure_directory_with_permissions(directory_path: str):
|
||||
"""
|
||||
Ensures that a directory exists and the current user has read/write permissions.
|
||||
If the directory does not exist, attempts to create it after checking the parent directory for write permission.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
directory_path : str
|
||||
The path to the directory to check or create.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if the directory exists and has the correct permissions, or if it was successfully created.
|
||||
False if the directory cannot be created or does not have the correct permissions.
|
||||
"""
|
||||
if directory_path is None:
|
||||
return
|
||||
|
||||
try:
|
||||
if not os.path.exists(directory_path):
|
||||
parent_directory = os.path.dirname(directory_path)
|
||||
if not has_permissions(parent_directory, write=True):
|
||||
raise OSError(f"Parent directory {parent_directory} does not have write permissions")
|
||||
|
||||
os.makedirs(directory_path)
|
||||
|
||||
if not has_permissions(directory_path, read=True, write=True):
|
||||
raise OSError(f"Directory {directory_path} does not have read/write permissions")
|
||||
except OSError as err:
|
||||
raise OSError(f"Error checking or creating directory: {err}")
|
||||
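A short, illustrative sketch of how these helpers compose; the logger name and directory paths are placeholders:

    import logging

    logger = logging.getLogger("nv_ingest_demo")  # hypothetical logger name
    configure_logging(logger, "DEBUG")

    # Only attempt to create the output directory if the parent is writable.
    if has_permissions("/tmp", read=True, write=True):
        ensure_directory_with_permissions("/tmp/nv_ingest_output")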
0
client/src/nv_ingest_client/cli/util/tasks.py
Normal file
7
client/src/nv_ingest_client/client/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .client import NvIngestClient
|
||||
|
||||
__all__ = ["NvIngestClient"]
|
||||
453
client/src/nv_ingest_client/client/client.py
Normal file
@@ -0,0 +1,453 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=broad-except
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from concurrent.futures import Future
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from concurrent.futures import as_completed
|
||||
from typing import Callable
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
from typing import Tuple
|
||||
from typing import Union
|
||||
|
||||
from nv_ingest_client.message_clients.redis.redis_client import RedisClient
|
||||
from nv_ingest_client.primitives import JobSpec
|
||||
from nv_ingest_client.primitives.jobs import JobState
|
||||
from nv_ingest_client.primitives.jobs import JobStateEnum
|
||||
from nv_ingest_client.primitives.tasks import Task
|
||||
from nv_ingest_client.primitives.tasks import TaskType
|
||||
from nv_ingest_client.primitives.tasks import task_factory
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DataDecodeException(Exception):
|
||||
"""
|
||||
Exception raised for errors in decoding data.
|
||||
|
||||
Attributes:
|
||||
message -- explanation of the error
|
||||
data -- the data that failed to decode (optional)
|
||||
"""
|
||||
|
||||
def __init__(self, message="Data decoding error", data=None):
|
||||
self.message = message
|
||||
self.data = data
|
||||
super().__init__(f"{message}: {data}")
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}({self.message}, Data={self.data})"
|
||||
|
||||
|
||||
class NvIngestClient:
|
||||
"""
|
||||
A client class for interacting with the nv-ingest service, supporting custom client allocators.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message_client_allocator: Callable[..., RedisClient] = RedisClient,
|
||||
message_client_hostname: Optional[str] = "localhost",
|
||||
message_client_port: Optional[int] = 6379,
|
||||
message_client_kwargs: Optional[Dict] = None,
|
||||
msg_counter_id: Optional[str] = "nv-ingest-message-id",
|
||||
worker_pool_size: int = 1,
|
||||
) -> None:
|
||||
"""
|
||||
Initializes the NvIngestClient with a client allocator, Redis configuration, a message counter ID,
|
||||
and a worker pool for parallel processing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
message_client_allocator : Callable[..., RedisClient]
|
||||
A callable that returns an instance of the client used for communication.
|
||||
message_client_hostname : str, optional
|
||||
The hostname of the Redis server. Defaults to "localhost".
|
||||
message_client_port : int, optional
|
||||
The port number of the Redis server. Defaults to 6379.
|
||||
msg_counter_id : str, optional
|
||||
The Redis key for tracking message counts. Defaults to "nv-ingest-message-id".
|
||||
worker_pool_size : int, optional
|
||||
The number of worker processes in the pool. Defaults to 1.
|
||||
"""
|
||||
|
||||
self._job_states = {}
|
||||
self._message_client_hostname = message_client_hostname or "localhost"
|
||||
self._message_client_port = message_client_port or 6379
|
||||
self._message_counter_id = msg_counter_id or "nv-ingest-message-id"
|
||||
|
||||
logger.debug("Instantiate NvIngestClient:\n%s", str(self))
|
||||
self._message_client = message_client_allocator(
|
||||
host=self._message_client_hostname, port=self._message_client_port
|
||||
)
|
||||
|
||||
# Initialize the worker pool with the specified size
|
||||
self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)
|
||||
|
||||
self._telemetry = {}
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string representation of the NvIngestClient configuration and runtime state.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
A string representation of the client showing the Redis configuration.
|
||||
"""
|
||||
info = "NvIngestClient:\n"
|
||||
info += f" message_client_host: {self._message_client_hostname}\n"
|
||||
info += f" message_client_port: {self._message_client_port}\n"
|
||||
return info
|
||||
|
||||
def _generate_job_id(self) -> str:
|
||||
"""
|
||||
Generates a unique job ID by combining a UUID with an incremented value from Redis.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
A unique job ID in the format of "<UUID>_<Redis incremented value>".
|
||||
"""
|
||||
uid = uuid.uuid4()
|
||||
redis_msg_id = self._message_client.get_client().incr(self._message_counter_id)
|
||||
|
||||
return f"{uid}_{redis_msg_id}"
|
||||
|
||||
def _pop_job_state(self, job_id: str) -> JobState:
|
||||
"""
|
||||
Deletes the job with the specified ID from the job tracking dictionary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
job_id : str
|
||||
The ID of the job to delete.
|
||||
"""
|
||||
|
||||
job_state = self._get_and_check_job_state(job_id)
|
||||
self._job_states.pop(job_id)
|
||||
|
||||
return job_state
|
||||
|
||||
def _get_and_check_job_state(
|
||||
self,
|
||||
job_id: str,
|
||||
required_state: Union[JobStateEnum, List[JobStateEnum]] = None,
|
||||
) -> JobState:
|
||||
if required_state and not isinstance(required_state, list):
|
||||
required_state = [required_state]
|
||||
|
||||
job_state = self._job_states.get(job_id)
|
||||
|
||||
if not job_state:
|
||||
raise ValueError(f"Job with ID {job_state} does not exist")
|
||||
if required_state and (job_state.state not in required_state):
|
||||
raise ValueError(
|
||||
f"Job with ID {job_state.job_id} has invalid state {job_state.state}, expected {required_state}"
|
||||
)
|
||||
|
||||
return job_state
|
||||
|
||||
def job_count(self):
|
||||
return len(self._job_states)
|
||||
|
||||
def add_job(self, job_spec: JobSpec = None):
|
||||
job_id = job_spec.job_id or self._generate_job_id()
|
||||
if job_id and job_id in self._job_states:
|
||||
raise ValueError(f"Cannot create Job with ID {job_id}: already exists")
|
||||
|
||||
job_spec.job_id = job_id
|
||||
self._job_states[job_id] = JobState(job_spec=job_spec)
|
||||
|
||||
return job_id
|
||||
|
||||
def create_job(
|
||||
self,
|
||||
payload: str,
|
||||
source_id: str,
|
||||
source_name: str,
|
||||
document_type: str = None,
|
||||
tasks: Optional[list] = None,
|
||||
job_id: Optional[Union[uuid.UUID, str]] = None,
|
||||
extended_options: Optional[dict] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Creates a new job with the specified parameters and adds it to the job tracking dictionary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
job_id : uuid.UUID, optional
|
||||
The unique identifier for the job. If not provided, a new UUID will be generated.
|
||||
payload : dict
|
||||
The payload associated with the job. Defaults to an empty dictionary if not provided.
|
||||
tasks : list, optional
|
||||
A list of tasks to be associated with the job.
|
||||
document_type : str
|
||||
The type of document to be processed.
|
||||
source_id : str
|
||||
The source identifier for the job.
|
||||
source_name : str
|
||||
The unique name of the job's source data.
|
||||
extended_options : dict, optional
|
||||
Additional options for job creation.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
The job ID as a string.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If a job with the specified `job_id` already exists.
|
||||
"""
|
||||
|
||||
if job_id and job_id in self._job_states:
|
||||
raise ValueError(f"Cannot create Job with ID {job_id}: already exists")
|
||||
|
||||
document_type = document_type or source_name.split(".")[-1]
|
||||
job_id = str(job_id) if job_id else self._generate_job_id()
|
||||
job_spec = JobSpec(
|
||||
payload=payload or {},
|
||||
job_id=job_id,
|
||||
tasks=tasks,
|
||||
document_type=document_type,
|
||||
source_id=source_id,
|
||||
source_name=source_name,
|
||||
extended_options=extended_options,
|
||||
)
|
||||
|
||||
return self.add_job(job_spec)
|
||||
|
||||
def add_task(self, job_id: str, task: Task) -> None:
|
||||
job_state = self._get_and_check_job_state(job_id, required_state=JobStateEnum.PENDING)
|
||||
|
||||
job_state.job_spec.add_task(task)
|
||||
|
||||
def create_task(
|
||||
self,
|
||||
job_id: Union[uuid.UUID, str],
|
||||
task_type: TaskType,
|
||||
task_params: dict = None,
|
||||
) -> None:
|
||||
"""
|
||||
Creates a task of the specified type with given parameters and associates it with the existing job.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
job_id : Union[uuid.UUID, str]
|
||||
The unique identifier of the job to which the task will be added. This can be a UUID object or its string
|
||||
representation.
|
||||
task_type : TaskType
|
||||
The type of the task to be created, defined as an enum value.
|
||||
task_params : dict
|
||||
A dictionary containing parameters for the task.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the job with the specified `job_id` does not exist or if an attempt is made to modify a job after its
|
||||
submission.
|
||||
"""
|
||||
task_params = task_params or {}
|
||||
|
||||
return self.add_task(job_id, task_factory(task_type, **task_params))
|
||||
|
||||
def _fetch_job_result(self, job_id: str, timeout: float = 10, data_only: bool = True) -> Tuple[Dict, str]:
|
||||
"""
|
||||
Fetches the job result from a message client, handling potential errors and state changes.
|
||||
|
||||
Args:
|
||||
job_id (str): The identifier of the job.
|
||||
timeout (float): Timeout for the fetch operation in seconds.
|
||||
data_only (bool): If True, only returns the data part of the job result.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, str]: The job result and the job ID.
|
||||
|
||||
Raises:
|
||||
ValueError: If there is an error in decoding the job result.
|
||||
TimeoutError: If the fetch operation times out.
|
||||
Exception: For all other unexpected issues.
|
||||
"""
|
||||
|
||||
try:
|
||||
job_state = self._get_and_check_job_state(job_id, required_state=[JobStateEnum.SUBMITTED])
|
||||
response = self._message_client.fetch_message(job_state.response_channel, timeout)
|
||||
|
||||
if response is not None:
|
||||
try:
|
||||
job_state.state = JobStateEnum.PROCESSING
|
||||
response_json = json.loads(response)
|
||||
if data_only:
|
||||
response_json = response_json["data"]
|
||||
|
||||
return response_json, job_id
|
||||
except json.JSONDecodeError as err:
|
||||
logger.error(f"Error decoding job result for job ID {job_id}: {err}")
|
||||
raise ValueError(f"Error decoding job result: {err}") from err
|
||||
finally:
|
||||
# Only pop once we know we've successfully decoded the response or errored out
|
||||
_ = self._pop_job_state(job_id)
|
||||
else:
|
||||
raise TimeoutError(f"Timeout: No response within {timeout} seconds for job ID {job_id}")
|
||||
|
||||
except TimeoutError:
|
||||
raise
|
||||
except Exception as err:
|
||||
logger.error(f"Unexpected error while fetching job result for job ID {job_id}: {err}")
|
||||
raise
|
||||
|
||||
def fetch_job_result(self, job_ids: Union[str, List[str]], timeout: float = 10, data_only: bool = True):
|
||||
if isinstance(job_ids, str):
|
||||
job_ids = [job_ids]
|
||||
|
||||
return [self._fetch_job_result(job_id, timeout, data_only) for job_id in job_ids]
|
||||
|
||||
def _ensure_submitted(self, job_ids: List[str]):
|
||||
if isinstance(job_ids, str):
|
||||
job_ids = [job_ids] # Ensure job_ids is always a list
|
||||
|
||||
submission_futures = {}
|
||||
for job_id in job_ids:
|
||||
job_state = self._get_and_check_job_state(
|
||||
job_id,
|
||||
required_state=[JobStateEnum.SUBMITTED, JobStateEnum.SUBMITTED_ASYNC],
|
||||
)
|
||||
if job_state.state == JobStateEnum.SUBMITTED_ASYNC:
|
||||
submission_futures[job_state.future] = job_state
|
||||
|
||||
for future in as_completed(submission_futures.keys()):
|
||||
job_state = submission_futures[future]
|
||||
job_state.state = JobStateEnum.SUBMITTED
|
||||
job_state.future = None
|
||||
|
||||
def fetch_job_result_async(
|
||||
self, job_ids: Union[str, List[str]], timeout: float = 10, data_only: bool = True
|
||||
) -> Dict[Future, str]:
|
||||
"""
|
||||
Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to job IDs.
|
||||
|
||||
Parameters:
|
||||
job_ids (Union[str, List[str]]): A single job ID or a list of job IDs.
|
||||
timeout (float): Timeout for fetching each job result, in seconds.
|
||||
data_only (bool): Whether to return only the data part of the job result.
|
||||
|
||||
Returns:
|
||||
Dict[Future, str]: A dictionary mapping each future to its corresponding job ID.
|
||||
"""
|
||||
if isinstance(job_ids, str):
|
||||
job_ids = [job_ids] # Ensure job_ids is always a list
|
||||
|
||||
# Make sure all jobs have actually been submitted before launching fetches.
|
||||
self._ensure_submitted(job_ids)
|
||||
|
||||
future_to_job_id = {}
|
||||
for job_id in job_ids:
|
||||
job_state = self._get_and_check_job_state(job_id)
|
||||
|
||||
future = self._worker_pool.submit(self.fetch_job_result, job_id, timeout, data_only)
|
||||
job_state.future = future
|
||||
future_to_job_id[future] = job_id
|
||||
|
||||
return future_to_job_id
|
||||
|
||||
def _submit_job(
|
||||
self,
|
||||
job_id: str,
|
||||
job_queue_id: str,
|
||||
) -> Optional[Dict]:
|
||||
"""
|
||||
Submits a job to a specified job queue.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
job_id : str
|
||||
The unique identifier of the job to be submitted.
|
||||
job_queue_id : str
|
||||
The ID of the job queue where the job will be submitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Optional[Dict]
|
||||
Always None; job results are retrieved separately once the job has been submitted.
|
||||
|
||||
Raises
|
||||
------
|
||||
Exception
|
||||
If submitting the job fails.
|
||||
"""
|
||||
|
||||
job_state = self._get_and_check_job_state(
|
||||
job_id, required_state=[JobStateEnum.PENDING, JobStateEnum.SUBMITTED_ASYNC]
|
||||
)
|
||||
|
||||
job_spec_str = json.dumps(job_state.job_spec.to_dict())
|
||||
response_channel = f"response_{job_id}"
|
||||
|
||||
try:
|
||||
self._message_client.submit_message(job_queue_id, job_spec_str)
|
||||
job_state.response_channel = response_channel
|
||||
job_state.state = JobStateEnum.SUBMITTED
|
||||
# job_state.future = None
|
||||
|
||||
# Free up memory -- payload should never be used again, and we don't want to keep it around.
|
||||
job_state.job_spec.payload = None
|
||||
except Exception as err:
|
||||
logger.error(f"Failed to submit job {job_id} to queue {job_queue_id}: {err}")
|
||||
job_state.state = JobStateEnum.FAILED
|
||||
raise
|
||||
|
||||
return None
|
||||
|
||||
def submit_job(self, job_ids: Union[str, List[str]], job_queue_id: str) -> List[Union[Dict, None]]:
|
||||
if isinstance(job_ids, str):
|
||||
job_ids = [job_ids]
|
||||
|
||||
return [self._submit_job(job_id, job_queue_id) for job_id in job_ids]
|
||||
|
||||
def submit_job_async(self, job_ids: Union[str, List[str]], job_queue_id: str) -> Dict[Future, str]:
|
||||
"""
|
||||
Asynchronously submits one or more jobs to a specified job queue using a thread pool.
|
||||
This method handles both single job ID or a list of job IDs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
job_ids : Union[str, List[str]]
|
||||
A single job ID or a list of job IDs to be submitted.
|
||||
job_queue_id : str
|
||||
The ID of the job queue where the jobs will be submitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dict[Future, str]
|
||||
A dictionary mapping futures to their respective job IDs for later retrieval of outcomes.
|
||||
|
||||
Notes
|
||||
-----
|
||||
- This method queues the jobs for asynchronous submission and returns a mapping of futures to job IDs.
|
||||
- It does not wait for any of the jobs to complete.
|
||||
- Ensure that each job is in the proper state before submission.
|
||||
"""
|
||||
|
||||
if isinstance(job_ids, str):
|
||||
job_ids = [job_ids] # Convert single job_id to a list
|
||||
|
||||
future_to_job_id = {}
|
||||
for job_id in job_ids:
|
||||
job_state = self._get_and_check_job_state(job_id, JobStateEnum.PENDING)
|
||||
job_state.state = JobStateEnum.SUBMITTED_ASYNC
|
||||
|
||||
future = self._worker_pool.submit(self.submit_job, job_id, job_queue_id)
|
||||
job_state.future = future
|
||||
future_to_job_id[future] = job_id
|
||||
|
||||
return future_to_job_id
|
||||
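A condensed, illustrative walk through the client API defined above; it assumes a Redis instance reachable at localhost:6379, the queue name matches the "morpheus_task_queue" used by the CLI later in this commit, and the payload and source names are placeholders:

    client = NvIngestClient(message_client_hostname="localhost", message_client_port=6379)

    job_id = client.create_job(
        payload="<document content>",   # placeholder payload
        source_id="example.pdf",
        source_name="example.pdf",
    )
    client.submit_job(job_id, "morpheus_task_queue")

    # fetch_job_result returns a list of (result, job_id) tuples.
    result, returned_id = client.fetch_job_result(job_id, timeout=30)[0]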
7
client/src/nv_ingest_client/message_clients/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .client_base import MessageClientBase
|
||||
|
||||
__all__ = ["MessageClientBase"]
|
||||
72
client/src/nv_ingest_client/message_clients/client_base.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
|
||||
|
||||
class MessageClientBase(ABC):
|
||||
"""
|
||||
Abstract base class for a messaging client to interface with various messaging systems.
|
||||
|
||||
Provides a standard interface for sending and receiving messages with connection management
|
||||
and retry logic.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(
|
||||
self,
|
||||
host: str,
|
||||
port: int,
|
||||
db: int = 0,
|
||||
max_retries: int = 0,
|
||||
max_backoff: int = 32,
|
||||
connection_timeout: int = 300,
|
||||
max_pool_size: int = 128,
|
||||
use_ssl: bool = False,
|
||||
):
|
||||
"""
|
||||
Initialize the messaging client with connection parameters.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_client(self):
|
||||
"""
|
||||
Returns the client instance, reconnecting if necessary.
|
||||
|
||||
Returns:
|
||||
The client instance.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def ping(self) -> bool:
|
||||
"""
|
||||
Checks if the server is responsive.
|
||||
|
||||
Returns:
|
||||
True if the server responds to a ping, False otherwise.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def fetch_message(self, channel_name: str, timeout: float = 0) -> str:
|
||||
"""
|
||||
Fetches a message from the specified queue with retries on failure.
|
||||
|
||||
Parameters:
|
||||
channel_name (str): The name of the task queue to fetch messages from.
|
||||
timeout (float): The timeout in seconds for blocking until a message is available.
|
||||
|
||||
Returns:
|
||||
The fetched message, or None if no message could be fetched.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def submit_message(self, channel_name: str, message: str):
|
||||
"""
|
||||
Submits a message to a specified queue with retries on failure.
|
||||
|
||||
Parameters:
|
||||
channel_name (str): The name of the queue to submit the message to.
|
||||
message (str): The message to submit.
|
||||
"""
|
||||
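To illustrate the contract only, a minimal hypothetical subclass of MessageClientBase backed by in-process queues; it is not part of this commit:

    from collections import defaultdict
    from collections import deque


    class InMemoryClient(MessageClientBase):
        """Toy implementation for illustration; stores messages in local deques."""

        def __init__(self, host: str = "", port: int = 0, **kwargs):
            self._queues = defaultdict(deque)

        def get_client(self):
            return self

        def ping(self) -> bool:
            return True

        def fetch_message(self, channel_name: str, timeout: float = 0) -> str:
            queue = self._queues[channel_name]
            return queue.popleft() if queue else None

        def submit_message(self, channel_name: str, message: str):
            self._queues[channel_name].append(message)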
@@ -0,0 +1,10 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
from .redis_client import RedisClient
|
||||
|
||||
__all__ = [
|
||||
"RedisClient",
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: skip-file
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
from typing import Optional
|
||||
|
||||
import redis
|
||||
from nv_ingest_client.message_clients import MessageClientBase
|
||||
from redis.exceptions import RedisError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RedisClient(MessageClientBase):
|
||||
"""
|
||||
A client for interfacing with Redis, providing mechanisms for sending and receiving messages
|
||||
with retry logic and connection management.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host : str
|
||||
The hostname of the Redis server.
|
||||
port : int
|
||||
The port number of the Redis server.
|
||||
db : int, optional
|
||||
The database number to connect to. Default is 0.
|
||||
max_retries : int, optional
|
||||
The maximum number of retry attempts for operations. Default is 0 (no retries).
|
||||
max_backoff : int, optional
|
||||
The maximum backoff delay between retries in seconds. Default is 32 seconds.
|
||||
connection_timeout : int, optional
|
||||
The timeout in seconds for connecting to the Redis server. Default is 300 seconds.
|
||||
max_pool_size : int, optional
|
||||
The maximum number of connections in the Redis connection pool. Default is 128.
|
||||
use_ssl : bool, optional
|
||||
Specifies if SSL should be used for the connection. Default is False.
|
||||
redis_allocator : Any, optional
|
||||
The Redis client allocator, allowing for custom Redis client instances. Default is redis.Redis.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
client : Any
|
||||
The Redis client instance used for operations.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host: str,
|
||||
port: int,
|
||||
db: int = 0,
|
||||
max_retries: int = 0,
|
||||
max_backoff: int = 32,
|
||||
connection_timeout: int = 300,
|
||||
max_pool_size: int = 128,
|
||||
use_ssl: bool = False,
|
||||
redis_allocator: Any = redis.Redis, # Type hint as 'Any' due to dynamic nature
|
||||
):
|
||||
self._host = host
|
||||
self._port = port
|
||||
self._db = db
|
||||
self._max_retries = max_retries
|
||||
self._max_backoff = max_backoff
|
||||
self._connection_timeout = connection_timeout
|
||||
self._use_ssl = use_ssl
|
||||
self._pool = redis.ConnectionPool(
|
||||
host=self._host,
|
||||
port=self._port,
|
||||
db=self._db,
|
||||
socket_connect_timeout=self._connection_timeout,
|
||||
max_connections=max_pool_size,
|
||||
)
|
||||
self._redis_allocator = redis_allocator
|
||||
self._client = self._redis_allocator(connection_pool=self._pool)
|
||||
self._retries = 0
|
||||
|
||||
def _connect(self) -> None:
|
||||
"""
|
||||
Attempts to reconnect to the Redis server if the current connection is not responsive.
|
||||
"""
|
||||
if not self.ping():
|
||||
logger.debug("Reconnecting to Redis")
|
||||
self._client = self._redis_allocator(connection_pool=self._pool)
|
||||
|
||||
@property
|
||||
def max_retries(self) -> int:
|
||||
return self._max_retries
|
||||
|
||||
@max_retries.setter
|
||||
def max_retries(self, value: int) -> None:
|
||||
self._max_retries = value
|
||||
|
||||
def get_client(self) -> Any:
|
||||
"""
|
||||
Returns a Redis client instance, reconnecting if necessary.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Any
|
||||
The Redis client instance.
|
||||
"""
|
||||
if self._client is None or not self.ping():
|
||||
self._connect()
|
||||
return self._client
|
||||
|
||||
def ping(self) -> bool:
|
||||
"""
|
||||
Checks if the Redis server is responsive.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if the server responds to a ping, False otherwise.
|
||||
"""
|
||||
try:
|
||||
self._client.ping()
|
||||
return True
|
||||
except (RedisError, AttributeError):
|
||||
return False
|
||||
|
||||
def fetch_message(self, channel_name: str, timeout: float = 10) -> Optional[str]:
|
||||
"""
|
||||
Fetches a message from the specified queue with retries on failure.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel_name : str
|
||||
The name of the task queue to fetch messages from.
|
||||
timeout : float
|
||||
The timeout in seconds for blocking until a message is available.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Optional[str]
|
||||
The fetched message, or None if no message could be fetched.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If fetching the message fails after the specified number of retries or due to other critical errors.
|
||||
"""
|
||||
retries = 0
|
||||
while True:
|
||||
try:
|
||||
response = self.get_client().blpop([channel_name], timeout)
|
||||
if response and response[1]:
|
||||
return response[1]
|
||||
return None
|
||||
except RedisError as err:
|
||||
retries += 1
|
||||
logger.error(f"Redis error during fetch: {err}")
|
||||
backoff_delay = min(2**retries, self._max_backoff)
|
||||
|
||||
if self.max_retries > 0 and retries <= self.max_retries:
|
||||
logger.error(f"Fetch attempt failed, retrying in {backoff_delay}s...")
|
||||
time.sleep(backoff_delay)
|
||||
else:
|
||||
logger.error(f"Failed to fetch message from {channel_name} after {retries} attempts.")
|
||||
raise ValueError(f"Failed to fetch message from Redis queue after {retries} attempts: {err}")
|
||||
|
||||
# Invalidate client to force reconnection on the next try
|
||||
self._client = None
|
||||
except Exception as e:
|
||||
# Handle non-Redis specific exceptions
|
||||
logger.error(f"Unexpected error during fetch from {channel_name}: {e}")
|
||||
raise ValueError(f"Unexpected error during fetch: {e}")
|
||||
|
||||
def submit_message(self, channel_name: str, message: str) -> None:
|
||||
"""
|
||||
Submits a message to a specified Redis queue with retries on failure.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel_name : str
|
||||
The name of the queue to submit the message to.
|
||||
message : str
|
||||
The message to submit.
|
||||
|
||||
Raises
|
||||
------
|
||||
RedisError
|
||||
If submitting the message fails after the specified number of retries.
|
||||
"""
|
||||
retries = 0
|
||||
while True:
|
||||
try:
|
||||
self.get_client().rpush(channel_name, message)
|
||||
logger.debug(f"Message submitted to {channel_name}")
|
||||
break
|
||||
except RedisError as e:
|
||||
logger.error(f"Failed to submit message, retrying... Error: {e}")
|
||||
self._client = None # Invalidate client to force reconnection
|
||||
retries += 1
|
||||
backoff_delay = min(2**retries, self._max_backoff)
|
||||
|
||||
if self.max_retries == 0 or retries < self.max_retries:
|
||||
logger.error(f"Submit attempt failed, retrying in {backoff_delay}s...")
|
||||
time.sleep(backoff_delay)
|
||||
else:
|
||||
logger.error(f"Failed to submit message to {channel_name} after {retries} attempts.")
|
||||
raise
|
||||
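Both retry loops above use capped exponential backoff; a quick sketch of the delay schedule they produce (assuming the default max_backoff of 32 seconds):

    # Delay before retry N is min(2**N, max_backoff) seconds.
    max_backoff = 32
    delays = [min(2**retries, max_backoff) for retries in range(1, 8)]
    # -> [2, 4, 8, 16, 32, 32, 32]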
259
client/src/nv_ingest_client/nv_ingest_cli.py
Normal file
@@ -0,0 +1,259 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from io import BytesIO
|
||||
from typing import List
|
||||
|
||||
import click
|
||||
import pkg_resources
|
||||
from nv_ingest_client.cli.util.click import ClientType
|
||||
from nv_ingest_client.cli.util.click import LogLevel
|
||||
from nv_ingest_client.cli.util.click import click_match_and_validate_files
|
||||
from nv_ingest_client.cli.util.click import click_validate_batch_size
|
||||
from nv_ingest_client.cli.util.click import click_validate_file_exists
|
||||
from nv_ingest_client.cli.util.click import click_validate_task
|
||||
from nv_ingest_client.cli.util.dataset import get_dataset_files
|
||||
from nv_ingest_client.cli.util.dataset import get_dataset_statistics
|
||||
from nv_ingest_client.cli.util.processing import create_and_process_jobs
|
||||
from nv_ingest_client.cli.util.processing import report_statistics
|
||||
from nv_ingest_client.cli.util.system import configure_logging
|
||||
from nv_ingest_client.cli.util.system import ensure_directory_with_permissions
|
||||
from nv_ingest_client.client import NvIngestClient
|
||||
from nv_ingest_client.message_clients.redis import RedisClient
|
||||
from pkg_resources import DistributionNotFound
|
||||
from pkg_resources import VersionConflict
|
||||
|
||||
try:
|
||||
NV_INGEST_VERSION = pkg_resources.get_distribution("nv_ingest").version
|
||||
except (DistributionNotFound, VersionConflict):
|
||||
NV_INGEST_VERSION = "Unknown -- No Distribution found or Version conflict."
|
||||
|
||||
try:
|
||||
NV_INGEST_CLIENT_VERSION = pkg_resources.get_distribution("nv_ingest_client").version
|
||||
except (DistributionNotFound, VersionConflict):
|
||||
NV_INGEST_CLIENT_VERSION = "Unknown -- No Distribution found or Version conflict."
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--batch_size",
|
||||
default=10,
|
||||
show_default=True,
|
||||
type=int,
|
||||
help="Batch size (must be >= 1).",
|
||||
callback=click_validate_batch_size,
|
||||
)
|
||||
@click.option(
|
||||
"--doc",
|
||||
multiple=True,
|
||||
default=None,
|
||||
type=click.Path(exists=False),
|
||||
help="Add a new document to be processed (supports multiple).",
|
||||
callback=click_match_and_validate_files,
|
||||
)
|
||||
@click.option(
|
||||
"--dataset",
|
||||
type=click.Path(exists=False),
|
||||
default=None,
|
||||
help="Path to a dataset definition file.",
|
||||
callback=click_validate_file_exists,
|
||||
)
|
||||
@click.option(
|
||||
"--client",
|
||||
type=click.Choice([client.value for client in ClientType], case_sensitive=False),
|
||||
default="REDIS",
|
||||
show_default=True,
|
||||
help="Client type.",
|
||||
)
|
||||
@click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
|
||||
@click.option("--client_port", default=6397, type=int, help="Port for the client endpoint.")
|
||||
@click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
|
||||
@click.option(
|
||||
"--concurrency_n", default=10, show_default=True, type=int, help="Number of inflight jobs to maintain at one time."
|
||||
)
|
||||
@click.option(
|
||||
"--document_processing_timeout",
|
||||
default=10,
|
||||
show_default=True,
|
||||
type=int,
|
||||
help="Timeout when waiting for a document to be processed.",
|
||||
)
|
||||
@click.option("--dry_run", is_flag=True, help="Perform a dry run without executing actions.")
|
||||
@click.option("--fail_on_error", is_flag=True, help="Fail on error.")
|
||||
@click.option("--output_directory", type=click.Path(), default=None, help="Output directory for results.")
|
||||
@click.option(
|
||||
"--log_level",
|
||||
type=click.Choice([level.value for level in LogLevel], case_sensitive=False),
|
||||
default="INFO",
|
||||
show_default=True,
|
||||
help="Log level.",
|
||||
)
|
||||
@click.option(
|
||||
"--shuffle_dataset", is_flag=True, default=True, show_default=True, help="Shuffle the dataset before processing."
|
||||
)
|
||||
@click.option(
|
||||
"--task",
|
||||
multiple=True,
|
||||
callback=click_validate_task,
|
||||
help="""
|
||||
\b
|
||||
Task definitions in JSON format, allowing multiple tasks to be configured by repeating this option.
|
||||
Each task must be specified with its type and corresponding options in the '[task_id]:{json_options}' format.
|
||||
|
||||
\b
|
||||
Example:
|
||||
--task 'split:{"split_by":"page", "split_length":10}'
|
||||
--task 'extract:{"document_type":"pdf", "extract_text":true}'
|
||||
--task 'extract:{"document_type":"pdf", "extract_method":"eclair"}'
|
||||
--task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
|
||||
--task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
|
||||
--task 'store:{"content_type":"image", "store_method":"minio", "endpoint":"minio:9000"}'
|
||||
--task 'embed:{"text":true, "tables":true}'
|
||||
--task 'vdb_upload'
|
||||
--task 'caption:{}'
|
||||
|
||||
\b
|
||||
Tasks and Options:
|
||||
- split: Divides documents according to specified criteria.
|
||||
Options:
|
||||
- split_by (str): Criteria ('page', 'size', 'word', 'sentence'). No default.
|
||||
- split_length (int): Segment length. No default.
|
||||
- split_overlap (int): Segment overlap. No default.
|
||||
- max_character_length (int): Maximum segment character count. No default.
|
||||
- sentence_window_size (int): Sentence window size. No default.
|
||||
\b
|
||||
- extract: Extracts content from documents, customizable per document type.
|
||||
Can be specified multiple times for different 'document_type' values.
|
||||
Options:
|
||||
- document_type (str): Document format ('pdf', 'docx', 'pptx', 'html', 'xml', 'excel', 'csv', 'parquet'). Required.
|
||||
- extract_method (str): Extraction technique. Defaults are smartly chosen based on 'document_type'.
|
||||
- extract_text (bool): Enables text extraction. Default: False.
|
||||
- extract_images (bool): Enables image extraction. Default: False.
|
||||
- extract_tables (bool): Enables table extraction. Default: False.
|
||||
\b
|
||||
- store: Stores any images extracted from documents.
|
||||
Options:
|
||||
- structured (bool): Flag to write extracted charts and tables to object store.
|
||||
- images (bool): Flag to write extracted images to object store.
|
||||
- store_method (str): Storage type ('minio'). Required.
|
||||
\b
|
||||
- caption: Attempts to extract captions for images extracted from documents. Note: this is not generative, but rather a
|
||||
simple extraction.
|
||||
Options:
|
||||
N/A
|
||||
\b
|
||||
- dedup: Identifies and optionally filters duplicate images in extraction.
|
||||
Options:
|
||||
- content_type (str): Content type to deduplicate ('image')
|
||||
- filter (bool): When set to True, duplicates will be filtered, otherwise, an info message will be added.
|
||||
\b
|
||||
- filter: Identifies and optionally filters images above or below scale thresholds.
|
||||
Options:
|
||||
- content_type (str): Content type to filter ('image')
|
||||
- min_size: (Union[float, int]): Minimum allowable size of extracted image.
|
||||
- max_aspect_ratio: (Union[float, int]): Maximum allowable aspect ratio of extracted image.
|
||||
- min_aspect_ratio: (Union[float, int]): Minimum allowable aspect ratio of extracted image.
|
||||
- filter (bool): When set to True, images outside the allowed thresholds are filtered; otherwise, an info message is added.
|
||||
\b
|
||||
- embed: Computes embeddings on multimodal extractions.
|
||||
Options:
|
||||
- text (bool): Flag to create embeddings for text extractions. Optional.
|
||||
- tables (bool): Flag to create embeddings for table extractions. Optional.
|
||||
- filter_errors (bool): Flag to filter embedding errors. Optional.
|
||||
\b
|
||||
- vdb_upload: Uploads extraction embeddings to vector database.
|
||||
\b
|
||||
Note: The 'extract_method' automatically selects the optimal method based on 'document_type' if not explicitly stated.
|
||||
""",
|
||||
)
|
||||
@click.option("--version", is_flag=True, help="Show version.")
|
||||
@click.pass_context
|
||||
def main(
|
||||
ctx,
|
||||
batch_size: int,
|
||||
client_host: str,
|
||||
client_kwargs: str,
|
||||
client_port: int,
|
||||
client: str,
|
||||
concurrency_n: int,
|
||||
dataset: str,
|
||||
doc: List[str],
|
||||
document_processing_timeout: int,
|
||||
dry_run: bool,
|
||||
fail_on_error: bool,
|
||||
log_level: str,
|
||||
output_directory: str,
|
||||
shuffle_dataset: bool,
|
||||
task: List[str],
|
||||
version: bool,
|
||||
):
|
||||
if version:
|
||||
click.echo(f"nv-ingest : {NV_INGEST_VERSION}")
|
||||
click.echo(f"nv-ingest-cli : {NV_INGEST_CLIENT_VERSION}")
|
||||
return
|
||||
|
||||
try:
|
||||
configure_logging(logger, log_level)
|
||||
logging.debug(f"nv-ingest-cli:params:\n{json.dumps(ctx.params, indent=2, default=repr)}")
|
||||
|
||||
docs = list(doc)
|
||||
if dataset:
|
||||
dataset = dataset[0]
|
||||
logger.info(f"Processing dataset: {dataset}")
|
||||
with open(dataset, "rb") as file:
|
||||
dataset_bytes = BytesIO(file.read())
|
||||
|
||||
logger.debug(get_dataset_statistics(dataset_bytes))
|
||||
docs.extend(get_dataset_files(dataset_bytes, shuffle_dataset))
|
||||
|
||||
logger.info(f"Processing {len(docs)} documents.")
|
||||
if output_directory:
|
||||
_msg = f"Output will be written to: {output_directory}"
|
||||
if dry_run:
|
||||
_msg = f"[Dry-Run] {_msg}"
|
||||
else:
|
||||
ensure_directory_with_permissions(output_directory)
|
||||
|
||||
logger.info(_msg)
|
||||
|
||||
if not dry_run:
|
||||
logging.debug(
|
||||
f"Creating message client: {client} with host: {client_host} and port: {client_port} -> {client_kwargs}"
|
||||
)
|
||||
client_allocator = RedisClient
|
||||
|
||||
ingest_client = NvIngestClient(
|
||||
message_client_allocator=client_allocator,
|
||||
message_client_hostname=client_host,
|
||||
message_client_port=client_port,
|
||||
message_client_kwargs=json.loads(client_kwargs),
|
||||
worker_pool_size=concurrency_n,
|
||||
)
|
||||
|
||||
start_time_ns = time.time_ns()
|
||||
(total_files, trace_times, pages_processed, total_timeouts) = create_and_process_jobs(
|
||||
files=docs,
|
||||
client=ingest_client,
|
||||
tasks=task,
|
||||
output_directory=output_directory,
|
||||
batch_size=batch_size,
|
||||
timeout=document_processing_timeout,
|
||||
fail_on_error=fail_on_error,
|
||||
)
|
||||
|
||||
report_statistics(start_time_ns, trace_times, pages_processed, total_files, total_timeouts)
|
||||
|
||||
except Exception as err:
|
||||
logging.error(f"Error: {err}")
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
8
client/src/nv_ingest_client/primitives/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .jobs import JobSpec
|
||||
from .tasks import Task
|
||||
|
||||
__all__ = ["JobSpec", "Task"]
|
||||
9
client/src/nv_ingest_client/primitives/jobs/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .job_spec import JobSpec
|
||||
from .job_state import JobState
|
||||
from .job_state import JobStateEnum
|
||||
|
||||
__all__ = ["JobSpec", "JobState", "JobStateEnum"]
|
||||
153
client/src/nv_ingest_client/primitives/jobs/job_spec.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import logging
|
||||
import typing
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
from nv_ingest_client.primitives.tasks import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobSpec:
|
||||
"""
|
||||
Specification for creating a job for submission to the nv-ingest microservice.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
payload : Dict
|
||||
The payload data for the job.
|
||||
tasks : Optional[List], optional
|
||||
A list of tasks to be added to the job, by default None.
|
||||
source_id : Optional[str], optional
|
||||
An identifier for the source of the job, by default None.
|
||||
job_id : Optional[UUID], optional
|
||||
A unique identifier for the job, by default a new UUID is generated.
|
||||
extended_options : Optional[Dict], optional
|
||||
Additional options for job processing, by default None.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
_payload : Dict
|
||||
Storage for the payload data.
|
||||
_tasks : List
|
||||
Storage for the list of tasks.
|
||||
_source_id : str
|
||||
Storage for the source identifier.
|
||||
_job_id : UUID
|
||||
Storage for the job's unique identifier.
|
||||
_extended_options : Dict
|
||||
Storage for the additional options.
|
||||
|
||||
Methods
|
||||
-------
|
||||
to_dict() -> Dict:
|
||||
Converts the job specification to a dictionary.
|
||||
add_task(task):
|
||||
Adds a task to the job specification.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
payload: Optional[str] = None,
|
||||
tasks: Optional[List] = None,
|
||||
source_id: Optional[str] = None,
|
||||
source_name: Optional[str] = None,
|
||||
document_type: Optional[str] = None,
|
||||
job_id: Optional[typing.Union[UUID, str]] = None,
|
||||
extended_options: Optional[Dict] = None,
|
||||
) -> None:
|
||||
self._document_type = document_type or "txt"
|
||||
self._extended_options = extended_options or {}
|
||||
self._job_id = job_id
|
||||
self._payload = payload
|
||||
self._source_id = source_id
|
||||
self._source_name = source_name
|
||||
self._tasks = tasks or []
|
||||
|
||||
def __str__(self) -> str:
|
||||
task_info = "\n".join(str(task) for task in self._tasks)
|
||||
return (
|
||||
f"job-id: {self._job_id}\n"
|
||||
f"source-id: {self._source_id}\n"
|
||||
f"task count: {len(self._tasks)}\n"
|
||||
f"{task_info}"
|
||||
)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Converts the job specification instance into a dictionary suitable for JSON serialization.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dict
|
||||
A dictionary representation of the job specification.
|
||||
"""
|
||||
return {
|
||||
"job_payload": {
|
||||
"source_name": [self._source_name],
|
||||
"source_id": [self._source_id],
|
||||
"content": [self._payload],
|
||||
"document_type": [self._document_type],
|
||||
},
|
||||
"job_id": str(self._job_id),
|
||||
"tasks": [task.to_dict() for task in self._tasks],
|
||||
"tracing_options": self._extended_options.get("tracing_options", {}),
|
||||
}
|
||||
|
||||
@property
|
||||
def payload(self) -> Dict:
|
||||
return self._payload
|
||||
|
||||
@payload.setter
|
||||
def payload(self, payload: Dict) -> None:
|
||||
self._payload = payload
|
||||
|
||||
@property
|
||||
def job_id(self) -> UUID:
|
||||
return self._job_id
|
||||
|
||||
@job_id.setter
|
||||
def job_id(self, job_id: UUID) -> None:
|
||||
self._job_id = job_id
|
||||
|
||||
@property
|
||||
def source_id(self) -> str:
|
||||
return self._source_id
|
||||
|
||||
@source_id.setter
|
||||
def source_id(self, source_id: str) -> None:
|
||||
self._source_id = source_id
|
||||
|
||||
@property
|
||||
def source_name(self) -> str:
|
||||
return self._source_name
|
||||
|
||||
@source_name.setter
|
||||
def source_name(self, source_name: str) -> None:
|
||||
self._source_name = source_name
|
||||
|
||||
def add_task(self, task) -> None:
|
||||
"""
|
||||
Adds a task to the job specification.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
task
|
||||
The task to add to the job specification. Assumes the task has a to_dict method.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the task does not have a to_dict method.
|
||||
"""
|
||||
if not isinstance(task, Task):
|
||||
raise ValueError("Task must derive from nv_ingest_client.primitives.Task class")
|
||||
|
||||
self._tasks.append(task)
|
||||
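As a rough usage sketch of the JobSpec class above (the file name and payload below are placeholders; in practice the payload comes from the client's file-processing helpers):

from nv_ingest_client.primitives import JobSpec
from nv_ingest_client.primitives.tasks import ExtractTask

job = JobSpec(
    payload="<base64-encoded document content>",
    source_id="report.pdf",
    source_name="report.pdf",
    document_type="pdf",
)
job.add_task(ExtractTask(document_type="pdf", extract_text=True))
print(job.to_dict())  # JSON-serializable dict submitted to the service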
157
client/src/nv_ingest_client/primitives/jobs/job_state.py
Normal file
@@ -0,0 +1,157 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import logging
|
||||
from concurrent.futures import Future
|
||||
from enum import Enum
|
||||
from enum import auto
|
||||
from typing import Dict
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
from uuid import UUID
|
||||
|
||||
from .job_spec import JobSpec
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobStateEnum(Enum):
|
||||
"""
|
||||
Enumeration of possible states for a job in the NvIngestClient system.
|
||||
"""
|
||||
|
||||
PENDING = auto() # Job has been created but not yet submitted or processed.
|
||||
SUBMITTED_ASYNC = auto() # Job has been submitted to the queue asynchronously.
|
||||
SUBMITTED = auto() # Job has been submitted to the queue.
|
||||
PROCESSING = auto() # Job is currently being processed.
|
||||
COMPLETED = auto() # Job has completed processing successfully.
|
||||
FAILED = auto() # Job has failed during processing.
|
||||
CANCELLED = auto() # Job has been cancelled before completion.
|
||||
|
||||
|
||||
_TERMINAL_STATES = {JobStateEnum.COMPLETED, JobStateEnum.FAILED, JobStateEnum.CANCELLED}
|
||||
_PREFLIGHT_STATES = {JobStateEnum.PENDING, JobStateEnum.SUBMITTED_ASYNC}
|
||||
|
||||
|
||||
class JobState:
|
||||
"""
|
||||
Encapsulates the state information for a job managed by the NvIngestClient.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
job_spec: JobSpec
|
||||
The specification describing the job.
|
||||
state : JobStateEnum
|
||||
The current state of the job.
|
||||
future : Future, optional
|
||||
The future object associated with the job's asynchronous operation.
|
||||
response : Dict, optional
|
||||
The response data received for the job.
|
||||
response_channel : str, optional
|
||||
The channel through which responses for the job are received.
|
||||
|
||||
Methods
|
||||
-------
|
||||
__init__(self, job_spec: JobSpec, state: JobStateEnum = JobStateEnum.PENDING, future: Optional[Future] = None,
|
||||
response: Optional[Dict] = None, response_channel: Optional[str] = None)
|
||||
Initializes a new instance of JobState.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
job_spec: JobSpec,
|
||||
state: JobStateEnum = JobStateEnum.PENDING,
|
||||
future: Optional[Future] = None,
|
||||
response: Optional[Dict] = None,
|
||||
response_channel: Optional[str] = None,
|
||||
) -> None:
|
||||
self._job_spec = job_spec
|
||||
self._state = state
|
||||
self._future = future
|
||||
self._response = response # TODO(Devin): Not currently used
|
||||
self._response_channel = response_channel
|
||||
self._telemetry = {}
|
||||
|
||||
@property
|
||||
def job_spec(self) -> JobSpec:
|
||||
"""Gets the job specification associated with the state."""
|
||||
return self._job_spec
|
||||
|
||||
@job_spec.setter
|
||||
def job_spec(self, value: JobSpec) -> None:
|
||||
"""Sets the job specification associated with the state."""
|
||||
if self._state not in _PREFLIGHT_STATES:
|
||||
err_msg = f"Attempt to change job_spec after job submission: {self._state.name}"
|
||||
logger.error(err_msg)
|
||||
|
||||
raise ValueError(err_msg)
|
||||
|
||||
self._job_spec = value
|
||||
|
||||
@property
|
||||
def job_id(self) -> Union[UUID, str]:
|
||||
"""Gets the job's unique identifier."""
|
||||
return self._job_spec.job_id
|
||||
|
||||
@job_id.setter
|
||||
def job_id(self, value: str) -> None:
|
||||
"""Sets the job's unique identifier, with constraints."""
|
||||
if self._state not in _PREFLIGHT_STATES:
|
||||
err_msg = f"Attempt to change job_id after job submission: {self._state.name}"
|
||||
logger.error(err_msg)
|
||||
raise ValueError(err_msg)
|
||||
self._job_spec.job_id = value
|
||||
|
||||
@property
|
||||
def state(self) -> JobStateEnum:
|
||||
"""Gets the current state of the job."""
|
||||
return self._state
|
||||
|
||||
@state.setter
|
||||
def state(self, value: JobStateEnum) -> None:
|
||||
"""Sets the current state of the job with transition constraints."""
|
||||
if self._state in _TERMINAL_STATES:
|
||||
logger.error(f"Attempt to change state from {self._state.name} to {value.name} denied.")
|
||||
raise ValueError(f"Cannot change state from {self._state.name} to {value.name}.")
|
||||
if value.value < self._state.value:
|
||||
logger.error(f"Invalid state transition attempt from {self._state.name} to {value.name}.")
|
||||
raise ValueError(f"State can only transition forward, from {self._state.name} to {value.name} not allowed.")
|
||||
self._state = value
|
||||
|
||||
@property
|
||||
def future(self) -> Optional[Future]:
|
||||
"""Gets the future object associated with the job's asynchronous operation."""
|
||||
return self._future
|
||||
|
||||
@future.setter
|
||||
def future(self, value: Future) -> None:
|
||||
"""Sets the future object associated with the job's asynchronous operation, with constraints."""
|
||||
self._future = value
|
||||
|
||||
# TODO(Devin): Not convinced we need 'response' probably remove.
|
||||
@property
|
||||
def response(self) -> Optional[Dict]:
|
||||
"""Gets the response data received for the job."""
|
||||
return self._response
|
||||
|
||||
@response.setter
|
||||
def response(self, value: Dict) -> None:
|
||||
"""Sets the response data received for the job, with constraints."""
|
||||
self._response = value
|
||||
|
||||
@property
|
||||
def response_channel(self) -> Optional[str]:
|
||||
"""Gets the channel through which responses for the job are received."""
|
||||
return self._response_channel
|
||||
|
||||
@response_channel.setter
|
||||
def response_channel(self, value: str) -> None:
|
||||
"""Sets the channel through which responses for the job are received, with constraints."""
|
||||
if self._state not in _PREFLIGHT_STATES:
|
||||
err_msg = f"Attempt to change response_channel after job submission: {self._state.name}"
|
||||
logger.error(err_msg)
|
||||
raise ValueError(err_msg)
|
||||
|
||||
self._response_channel = value
|
||||
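A small sketch of the forward-only state machine enforced by the JobState setters above:

from nv_ingest_client.primitives.jobs import JobSpec, JobState, JobStateEnum

state = JobState(JobSpec(payload="hello", document_type="txt"))
state.state = JobStateEnum.SUBMITTED   # forward transitions are allowed
# state.state = JobStateEnum.PENDING   # would raise ValueError: states only move forward
# state.job_spec = JobSpec()           # would also raise: job_spec is frozen after submission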
31
client/src/nv_ingest_client/primitives/tasks/__init__.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .caption import CaptionTask
|
||||
from .dedup import DedupTask
|
||||
from .embed import EmbedTask
|
||||
from .extract import ExtractTask
|
||||
from .filter import FilterTask
|
||||
from .split import SplitTask
|
||||
from .store import StoreTask
|
||||
from .task_base import Task
|
||||
from .task_base import TaskType
|
||||
from .task_base import is_valid_task_type
|
||||
from .task_factory import task_factory
|
||||
from .vdb_upload import VdbUploadTask
|
||||
|
||||
__all__ = [
|
||||
"CaptionTask",
|
||||
"ExtractTask",
|
||||
"is_valid_task_type",
|
||||
"SplitTask",
|
||||
"StoreTask",
|
||||
"Task",
|
||||
"task_factory",
|
||||
"TaskType",
|
||||
"DedupTask",
|
||||
"FilterTask",
|
||||
"EmbedTask",
|
||||
"VdbUploadTask",
|
||||
]
|
||||
45
client/src/nv_ingest_client/primitives/tasks/caption.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CaptionTaskSchema(BaseModel):
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class CaptionTask(Task):
|
||||
def __init__(
|
||||
self,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis
|
||||
"""
|
||||
task_properties = {
|
||||
"content_type": "image",
|
||||
}
|
||||
|
||||
return {"type": "caption", "task_properties": task_properties}
|
||||
76
client/src/nv_ingest_client/primitives/tasks/dedup.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import validator
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DedupTaskSchema(BaseModel):
|
||||
content_type: str = "image"
|
||||
filter: bool = False
|
||||
|
||||
@validator("content_type")
|
||||
def content_type_must_be_valid(cls, v):
|
||||
valid_criteria = ["image"]
|
||||
if v not in valid_criteria:
|
||||
raise ValueError(f"content_type must be one of {valid_criteria}")
|
||||
return v
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class DedupTask(Task):
|
||||
"""
|
||||
Object for document dedup task
|
||||
"""
|
||||
|
||||
_TypeContentType = Literal["image"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
content_type: _TypeContentType = "image",
|
||||
filter: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Setup Dedup Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
self._content_type = content_type
|
||||
self._filter = filter
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "Dedup Task:\n"
|
||||
info += f" content_type: {self._content_type}\n"
|
||||
info += f" filter: {self._filter}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis
|
||||
"""
|
||||
dedup_params = {"filter": self._filter}
|
||||
|
||||
task_properties = {
|
||||
"content_type": self._content_type,
|
||||
"params": dedup_params,
|
||||
}
|
||||
|
||||
return {"type": "dedup", "task_properties": task_properties}
|
||||
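Based on the to_dict method above, a DedupTask serializes into the task payload expected by the service; for example:

from nv_ingest_client.primitives.tasks import DedupTask

print(DedupTask(filter=True).to_dict())
# {'type': 'dedup', 'task_properties': {'content_type': 'image', 'params': {'filter': True}}}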
64
client/src/nv_ingest_client/primitives/tasks/embed.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EmbedTaskSchema(BaseModel):
|
||||
text: bool = True
|
||||
tables: bool = True
|
||||
filter_errors: bool = False
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class EmbedTask(Task):
|
||||
"""
|
||||
Object for document embedding task
|
||||
"""
|
||||
|
||||
def __init__(self, text: bool = True, tables: bool = True, filter_errors: bool = False) -> None:
|
||||
"""
|
||||
Setup Embed Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
self._text = text
|
||||
self._tables = tables
|
||||
self._filter_errors = filter_errors
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "Embed Task:\n"
|
||||
info += f" text: {self._text}\n"
|
||||
info += f" tables: {self._tables}\n"
|
||||
info += f" filter_errors: {self._filter_errors}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis
|
||||
"""
|
||||
|
||||
task_properties = {
|
||||
"text": self._text,
|
||||
"tables": self._tables,
|
||||
"filter_errors": False,
|
||||
}
|
||||
|
||||
return {"type": "embed", "task_properties": task_properties}
|
||||
224
client/src/nv_ingest_client/primitives/tasks/extract.py
Normal file
@@ -0,0 +1,224 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Dict
|
||||
from typing import Literal
|
||||
from typing import get_args
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import root_validator
|
||||
from pydantic import validator
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ECLAIR_TRITON_HOST = os.environ.get("ECLAIR_TRITON_HOST", "localhost")
|
||||
ECLAIR_TRITON_PORT = os.environ.get("ECLAIR_TRITON_PORT", "8001")
|
||||
ECLAIR_BATCH_SIZE = os.environ.get("ECLAIR_BATCH_SIZE", "16")
|
||||
|
||||
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY", None)
|
||||
UNSTRUCTURED_URL = os.environ.get("UNSTRUCTURED_URL", "https://api.unstructured.io/general/v0/general")
|
||||
UNSTRUCTURED_STRATEGY = os.environ.get("UNSTRUCTURED_STRATEGY", "auto")
|
||||
UNSTRUCTURED_CONCURRENCY_LEVEL = os.environ.get("UNSTRUCTURED_CONCURRENCY_LEVEL", 10)
|
||||
|
||||
ADOBE_CLIENT_ID = os.environ.get("ADOBE_CLIENT_ID", None)
|
||||
ADOBE_CLIENT_SECRET = os.environ.get("ADOBE_CLIENT_SECRET", None)
|
||||
|
||||
_DEFAULT_EXTRACTOR_MAP = {
|
||||
"pdf": "pdfium",
|
||||
"docx": "python_docx",
|
||||
"pptx": "python_pptx",
|
||||
"html": "beautifulsoup",
|
||||
"xml": "lxml",
|
||||
"excel": "openpyxl",
|
||||
"csv": "pandas",
|
||||
"parquet": "pandas",
|
||||
}
|
||||
|
||||
_Type_Extract_Method_PDF = Literal[
|
||||
"pdfium",
|
||||
"eclair",
|
||||
"haystack",
|
||||
"tika",
|
||||
"unstructured_io",
|
||||
"llama_parse",
|
||||
"adobe",
|
||||
]
|
||||
|
||||
_Type_Extract_Method_DOCX = Literal["python_docx", "haystack", "unstructured_local", "unstructured_service"]
|
||||
|
||||
_Type_Extract_Method_PPTX = Literal["python_pptx", "haystack", "unstructured_local", "unstructured_service"]
|
||||
|
||||
_Type_Extract_Method_Map = {
|
||||
"pdf": get_args(_Type_Extract_Method_PDF),
|
||||
"docx": get_args(_Type_Extract_Method_DOCX),
|
||||
"pptx": get_args(_Type_Extract_Method_PPTX),
|
||||
}
|
||||
|
||||
_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium"]
|
||||
|
||||
_Type_Extract_Tables_Method_DOCX = Literal["python_docx",]
|
||||
|
||||
_Type_Extract_Tables_Method_PPTX = Literal["python_pptx",]
|
||||
|
||||
_Type_Extract_Tables_Method_Map = {
|
||||
"pdf": get_args(_Type_Extract_Tables_Method_PDF),
|
||||
"docx": get_args(_Type_Extract_Tables_Method_DOCX),
|
||||
"pptx": get_args(_Type_Extract_Tables_Method_PPTX),
|
||||
}
|
||||
|
||||
|
||||
class ExtractTaskSchema(BaseModel):
|
||||
document_type: str
|
||||
extract_method: str = None # Initially allow None to set a smart default
|
||||
extract_text: bool = True
|
||||
extract_images: bool = True
|
||||
extract_tables: bool = False
|
||||
extract_tables_method: str = "yolox"
|
||||
text_depth: str = "document"
|
||||
|
||||
@root_validator(pre=True)
|
||||
def set_default_extract_method(cls, values):
|
||||
document_type = values.get("document_type", "").lower() # Ensure case-insensitive comparison
|
||||
extract_method = values.get("extract_method")
|
||||
|
||||
if document_type not in _DEFAULT_EXTRACTOR_MAP:
|
||||
raise ValueError(
|
||||
f"Unsupported document type: {document_type}."
|
||||
f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
||||
)
|
||||
|
||||
if extract_method is None:
|
||||
values["extract_method"] = _DEFAULT_EXTRACTOR_MAP[document_type]
|
||||
return values
|
||||
|
||||
@validator("extract_method")
|
||||
def extract_method_must_be_valid(cls, v, values, **kwargs):
|
||||
document_type = values.get("document_type", "").lower() # Ensure case-insensitive comparison
|
||||
valid_methods = set(_Type_Extract_Method_Map[document_type])
|
||||
if v not in valid_methods:
|
||||
raise ValueError(f"extract_method must be one of {valid_methods}")
|
||||
return v
|
||||
|
||||
@validator("document_type")
|
||||
def document_type_must_be_supported(cls, v):
|
||||
if v.lower() not in _DEFAULT_EXTRACTOR_MAP:
|
||||
raise ValueError(
|
||||
f"Unsupported document type '{v}'. Supported types are: {', '.join(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
||||
)
|
||||
return v.lower()
|
||||
|
||||
@validator("extract_tables_method")
|
||||
def extract_tables_method_must_be_valid(cls, v, values, **kwargs):
|
||||
document_type = values.get("document_type", "").lower() # Ensure case-insensitive comparison
|
||||
valid_methods = set(_Type_Extract_Tables_Method_Map[document_type])
|
||||
if v not in valid_methods:
|
||||
raise ValueError(f"extract_method must be one of {valid_methods}")
|
||||
return v
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class ExtractTask(Task):
|
||||
"""
|
||||
Object for document extraction task
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
document_type,
|
||||
extract_method: _Type_Extract_Method_PDF = "pdfium",
|
||||
extract_text: bool = False,
|
||||
extract_images: bool = False,
|
||||
extract_tables: bool = False,
|
||||
extract_tables_method: _Type_Extract_Tables_Method_PDF = "yolox",
|
||||
text_depth: str = "document",
|
||||
) -> None:
|
||||
"""
|
||||
Setup Extract Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self._document_type = document_type
|
||||
self._extract_images = extract_images
|
||||
self._extract_method = extract_method
|
||||
self._extract_tables = extract_tables
|
||||
self._extract_tables_method = extract_tables_method
|
||||
self._extract_text = extract_text
|
||||
self._text_depth = text_depth
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "Extract Task:\n"
|
||||
info += f" document type: {self._document_type}\n"
|
||||
info += f" extract method: {self._extract_method}\n"
|
||||
info += f" extract text: {self._extract_text}\n"
|
||||
info += f" extract images: {self._extract_images}\n"
|
||||
info += f" extract tables: {self._extract_tables}\n"
|
||||
info += f" extract tables method: {self._extract_tables_method}\n"
|
||||
info += f" text depth: {self._text_depth}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis (fixme)
|
||||
"""
|
||||
extract_params = {
|
||||
"extract_text": self._extract_text,
|
||||
"extract_images": self._extract_images,
|
||||
"extract_tables": self._extract_tables,
|
||||
"extract_tables_method": self._extract_tables_method,
|
||||
"text_depth": self._text_depth,
|
||||
}
|
||||
|
||||
task_properties = {
|
||||
"method": self._extract_method,
|
||||
"document_type": self._document_type,
|
||||
"params": extract_params,
|
||||
}
|
||||
|
||||
# TODO(Devin): I like the idea of Derived classes augmenting the to_dict method, but it's not logically
|
||||
# consistent with how we define tasks, we don't have multiple extract tasks, we have extraction paths based on
|
||||
# the method and the document type.
|
||||
if self._extract_method == "unstructured_local":
|
||||
unstructured_properties = {
|
||||
"api_key": "", # TODO(Devin): Should be an environment variable or configurable parameter
|
||||
"unstructured_url": "", # TODO(Devin): Should be an environment variable
|
||||
}
|
||||
task_properties["params"].update(unstructured_properties)
|
||||
elif self._extract_method == "eclair":
|
||||
eclair_properties = {
|
||||
"eclair_triton_host": os.environ.get("ECLAIR_TRITON_HOST", ECLAIR_TRITON_HOST),
|
||||
"eclair_triton_port": os.environ.get("ECLAIR_TRITON_PORT", ECLAIR_TRITON_PORT),
|
||||
"eclair_batch_size": os.environ.get("ECLAIR_BATCH_SIZE", ECLAIR_BATCH_SIZE),
|
||||
}
|
||||
task_properties["params"].update(eclair_properties)
|
||||
elif self._extract_method == "unstructured_io":
|
||||
unstructured_properties = {
|
||||
"unstructured_api_key": os.environ.get("UNSTRUCTURED_API_KEY", UNSTRUCTURED_API_KEY),
|
||||
"unstructured_url": os.environ.get("UNSTRUCTURED_URL", UNSTRUCTURED_URL),
|
||||
"unstructured_strategy": os.environ.get("UNSTRUCTURED_STRATEGY", UNSTRUCTURED_STRATEGY),
|
||||
"unstructured_concurrency_level": os.environ.get(
|
||||
"UNSTRUCTURED_CONCURRENCY_LEVEL", UNSTRUCTURED_CONCURRENCY_LEVEL
|
||||
),
|
||||
}
|
||||
task_properties["params"].update(unstructured_properties)
|
||||
elif self._extract_method == "adobe":
|
||||
adobe_properties = {
|
||||
"adobe_client_id": os.environ.get("ADOBE_CLIENT_ID", ADOBE_CLIENT_ID),
|
||||
"adobe_client_secrect": os.environ.get("ADOBE_CLIENT_SECRET", ADOBE_CLIENT_SECRET),
|
||||
}
|
||||
task_properties["params"].update(adobe_properties)
|
||||
return {"type": "extract", "task_properties": task_properties}
|
||||
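To illustrate the smart default noted in the CLI help, the schema's root validator fills in extract_method from the document type when it is omitted; a small sketch:

from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema

schema = ExtractTaskSchema(document_type="pdf", extract_text=True)
print(schema.extract_method)  # "pdfium", taken from _DEFAULT_EXTRACTOR_MAP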
94
client/src/nv_ingest_client/primitives/tasks/filter.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
from typing import Literal
|
||||
from typing import Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import validator
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FilterTaskSchema(BaseModel):
|
||||
content_type: str = "image"
|
||||
min_size: int = 128
|
||||
max_aspect_ratio: Union[float, int] = 5.0
|
||||
min_aspect_ratio: Union[float, int] = 0.2
|
||||
filter: bool = False
|
||||
|
||||
@validator("content_type")
|
||||
def content_type_must_be_valid(cls, v):
|
||||
valid_criteria = ["image"]
|
||||
if v not in valid_criteria:
|
||||
raise ValueError(f"content_type must be one of {valid_criteria}")
|
||||
return v
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class FilterTask(Task):
|
||||
"""
|
||||
Object for document filter task
|
||||
"""
|
||||
|
||||
_TypeContentType = Literal["image"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
content_type: _TypeContentType = "image",
|
||||
min_size: int = 128,
|
||||
max_aspect_ratio: Union[int, float] = 5.0,
|
||||
min_aspect_ratio: Union[int, float] = 0.2,
|
||||
filter: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Setup Filter Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
self._content_type = content_type
|
||||
self._min_size = min_size
|
||||
self._max_aspect_ratio = max_aspect_ratio
|
||||
self._min_aspect_ratio = min_aspect_ratio
|
||||
self._filter = filter
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "Filter Task:\n"
|
||||
info += f" content_type: {self._content_type}\n"
|
||||
info += f" min_size: {self._min_size}\n"
|
||||
info += f" max_aspect_ratio: {self._max_aspect_ratio}\n"
|
||||
info += f" min_aspect_ratio: {self._min_aspect_ratio}\n"
|
||||
info += f" filter: {self._filter}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis
|
||||
"""
|
||||
filter_params = {
|
||||
"min_size": self._min_size,
|
||||
"max_aspect_ratio": self._max_aspect_ratio,
|
||||
"min_aspect_ratio": self._min_aspect_ratio,
|
||||
"filter": self._filter,
|
||||
}
|
||||
|
||||
task_properties = {
|
||||
"content_type": self._content_type,
|
||||
"params": filter_params,
|
||||
}
|
||||
|
||||
return {"type": "filter", "task_properties": task_properties}
|
||||
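A quick sketch of how FilterTask parameters end up in the serialized task (the values here are arbitrary examples):

from nv_ingest_client.primitives.tasks import FilterTask

task = FilterTask(min_size=256, max_aspect_ratio=4.0, filter=True)
print(task.to_dict()["task_properties"]["params"])
# {'min_size': 256, 'max_aspect_ratio': 4.0, 'min_aspect_ratio': 0.2, 'filter': True}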
95
client/src/nv_ingest_client/primitives/tasks/split.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
from typing import Literal
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import validator
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SplitTaskSchema(BaseModel):
|
||||
split_by: Optional[str] = "sentence"
|
||||
split_length: Optional[int] = 10
|
||||
split_overlap: Optional[int] = 0
|
||||
max_character_length: Optional[int] = 1024
|
||||
sentence_window_size: Optional[int] = 0
|
||||
|
||||
@validator("split_by")
|
||||
def split_by_must_be_valid(cls, v):
|
||||
valid_criteria = ["page", "size", "word", "sentence"]
|
||||
if v not in valid_criteria:
|
||||
raise ValueError(f"split_by must be one of {valid_criteria}")
|
||||
return v
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class SplitTask(Task):
|
||||
"""
|
||||
Object for document splitting task
|
||||
"""
|
||||
|
||||
_TypeSplitBy = Literal["word", "sentence", "passage"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
split_by: _TypeSplitBy = None,
|
||||
split_length: int = None,
|
||||
split_overlap: int = None,
|
||||
max_character_length: int = None,
|
||||
sentence_window_size: int = None,
|
||||
) -> None:
|
||||
"""
|
||||
Setup Split Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
self._split_by = split_by
|
||||
self._split_length = split_length
|
||||
self._split_overlap = split_overlap
|
||||
self._max_character_length = max_character_length
|
||||
self._sentence_window_size = sentence_window_size
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "Split Task:\n"
|
||||
info += f" split_by: {self._split_by}\n"
|
||||
info += f" split_length: {self._split_length}\n"
|
||||
info += f" split_overlap: {self._split_overlap}\n"
|
||||
info += f" split_max_character_length: {self._max_character_length}\n"
|
||||
info += f" split_sentence_window_size: {self._sentence_window_size}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis
|
||||
"""
|
||||
split_params = {}
|
||||
|
||||
if self._split_by is not None:
|
||||
split_params["split_by"] = self._split_by
|
||||
if self._split_length is not None:
|
||||
split_params["split_length"] = self._split_length
|
||||
if self._split_overlap is not None:
|
||||
split_params["split_overlap"] = self._split_overlap
|
||||
if self._max_character_length is not None:
|
||||
split_params["max_character_length"] = self._max_character_length
|
||||
if self._sentence_window_size is not None:
|
||||
split_params["sentence_window_size"] = self._sentence_window_size
|
||||
|
||||
return {"type": "split", "task_properties": split_params}
|
||||
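Because to_dict only emits parameters that were explicitly set, a SplitTask built with two options produces a minimal payload; for example:

from nv_ingest_client.primitives.tasks import SplitTask

print(SplitTask(split_by="page", split_length=10).to_dict())
# {'type': 'split', 'task_properties': {'split_by': 'page', 'split_length': 10}}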
88
client/src/nv_ingest_client/primitives/tasks/store.py
Normal file
@@ -0,0 +1,88 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import root_validator
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DEFAULT_STORE_METHOD = "minio"
|
||||
|
||||
|
||||
class StoreTaskSchema(BaseModel):
|
||||
store_method: str = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
def set_default_store_method(cls, values):
|
||||
store_method = values.get("store_method")
|
||||
|
||||
if store_method is None:
|
||||
values["store_method"] = _DEFAULT_STORE_METHOD
|
||||
return values
|
||||
|
||||
class Config:
|
||||
extra = "allow"
|
||||
|
||||
|
||||
class StoreTask(Task):
|
||||
"""
|
||||
Object for image storage task.
|
||||
"""
|
||||
|
||||
_Type_Content_Type = Literal["image",]
|
||||
|
||||
_Type_Store_Method = Literal["minio",]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
structured: bool = True,
|
||||
images: bool = False,
|
||||
store_method: _Type_Store_Method = None,
|
||||
**extra_params,
|
||||
) -> None:
|
||||
"""
|
||||
Setup Store Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self._structured = structured
|
||||
self._images = images
|
||||
self._store_method = store_method or "minio"
|
||||
self._extra_params = extra_params
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "Store Task:\n"
|
||||
info += f" store structured types: {self._structured}\n"
|
||||
info += f" store image types: {self._images}\n"
|
||||
info += f" store method: {self._store_method}\n"
|
||||
for key, value in self._extra_params.items():
|
||||
info += f" {key}: {value}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis (fixme)
|
||||
"""
|
||||
task_properties = {
|
||||
"method": self._store_method,
|
||||
"structured": self._structured,
|
||||
"images": self._images,
|
||||
"params": self._extra_params,
|
||||
}
|
||||
|
||||
return {"type": "store", "task_properties": task_properties}
|
||||
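The **extra_params passthrough above is what lets the CLI example carry storage-specific settings such as an endpoint; a sketch (the endpoint value is a placeholder):

from nv_ingest_client.primitives.tasks import StoreTask

task = StoreTask(structured=True, images=True, store_method="minio", endpoint="minio:9000")
print(task.to_dict()["task_properties"]["params"])  # {'endpoint': 'minio:9000'}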
140
client/src/nv_ingest_client/primitives/tasks/task_base.py
Normal file
@@ -0,0 +1,140 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from enum import auto
|
||||
from typing import Dict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TaskType(Enum):
|
||||
CAPTION = auto()
|
||||
EMBED = auto()
|
||||
EXTRACT = auto()
|
||||
FILTER = auto()
|
||||
SPLIT = auto()
|
||||
TRANSFORM = auto()
|
||||
STORE = auto()
|
||||
VDB_UPLOAD = auto()
|
||||
|
||||
|
||||
def is_valid_task_type(task_type_str: str) -> bool:
|
||||
"""
|
||||
Checks if the provided string is a valid TaskType enum value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
task_type_str : str
|
||||
The string to check against the TaskType enum values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if the string is a valid TaskType enum value, False otherwise.
|
||||
"""
|
||||
return task_type_str in TaskType.__members__
|
||||
|
||||
|
||||
class Task:
|
||||
"""
|
||||
Generic task Object
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""
|
||||
Setup Ingest Task Config
|
||||
"""
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += f"{self.__class__.__name__}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Returns a dict with the task specification. This dict is used for constructing
|
||||
tasks that are then submitted to the redis client
|
||||
"""
|
||||
return {}
|
||||
|
||||
|
||||
# class ExtractUnstructuredTask(ExtractTask):
|
||||
# """
|
||||
# Object for document unstructured extraction task
|
||||
# extract_method = ["unstructured_local", "unstructured_service"]
|
||||
# """
|
||||
#
|
||||
# def __init__(
|
||||
# self,
|
||||
# extract_method: ExtractTask._Type_Extract_Method,
|
||||
# document_type: ExtractTask._TypeDocumentType,
|
||||
# api_key: str,
|
||||
# uri: str,
|
||||
# ) -> None:
|
||||
# """
|
||||
# Setup Extract Task Config
|
||||
# """
|
||||
# super().__init__(extract_method, document_type)
|
||||
# self._api_key = api_key
|
||||
# self._uri = uri
|
||||
#
|
||||
# def __str__(self) -> str:
|
||||
# """
|
||||
# Returns a string with the object's config and run time state
|
||||
# """
|
||||
# info = ""
|
||||
# info += super().__str__()
|
||||
# info += f"unstructured uri: {self._uri}\n"
|
||||
# return info
|
||||
#
|
||||
# def to_dict(self) -> Dict:
|
||||
# """
|
||||
# Convert to a dict for submission to redis (fixme)
|
||||
# """
|
||||
# unstructured_properties = {
|
||||
# "api_key": self._api_key,
|
||||
# "unstructured_url": self._uri,
|
||||
# }
|
||||
# task_desc = super().to_dict()
|
||||
# task_desc["task_properties"]["params"].update(unstructured_properties)
|
||||
# return task_desc
|
||||
|
||||
|
||||
# class ExtractLlamaParseTask(ExtractTask):
|
||||
# """
|
||||
# Object for document llama extraction task
|
||||
# extract_method = ["llama_parse"]
|
||||
# """
|
||||
#
|
||||
# def __init__(
|
||||
# self,
|
||||
# extract_method: ExtractTask._Type_Extract_Method,
|
||||
# document_type: ExtractTask._TypeDocumentType,
|
||||
# api_key: str,
|
||||
# ) -> None:
|
||||
# """
|
||||
# Setup Extract Task Config
|
||||
# """
|
||||
# super().__init__(extract_method, document_type)
|
||||
# self._api_key = api_key
|
||||
#
|
||||
# def to_dict(self) -> Dict:
|
||||
# """
|
||||
# Convert to a dict for submission to redis (fixme)
|
||||
# """
|
||||
# llama_parse_properties = {
|
||||
# "api_key": self._api_key,
|
||||
# }
|
||||
# task_desc = super().to_dict()
|
||||
# task_desc["task_properties"]["params"].update(llama_parse_properties)
|
||||
# return task_desc
|
||||
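Note that is_valid_task_type checks membership against the TaskType enum names, so the comparison is case-sensitive; for example:

from nv_ingest_client.primitives.tasks import is_valid_task_type

is_valid_task_type("SPLIT")  # True
is_valid_task_type("split")  # False; names must match the upper-case enum members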
95
client/src/nv_ingest_client/primitives/tasks/task_factory.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import inspect
|
||||
from typing import Callable
|
||||
from typing import Dict
|
||||
from typing import Type
|
||||
from typing import Union
|
||||
|
||||
from .caption import CaptionTask
|
||||
from .embed import EmbedTask
|
||||
from .extract import ExtractTask
|
||||
from .filter import FilterTask
|
||||
from .split import SplitTask
|
||||
from .store import StoreTask
|
||||
from .task_base import Task
|
||||
from .task_base import TaskType
|
||||
from .task_base import is_valid_task_type
|
||||
from .vdb_upload import VdbUploadTask
|
||||
|
||||
|
||||
class TaskUnimplemented(Task):
|
||||
"""
|
||||
Placeholder for unimplemented tasks
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__()
|
||||
raise NotImplementedError("Task type is not implemented")
|
||||
|
||||
|
||||
# Mapping of TaskType to Task classes, arranged alphabetically by task type
|
||||
_TASK_MAP: Dict[TaskType, Callable] = {
|
||||
TaskType.CAPTION: CaptionTask,
|
||||
TaskType.EMBED: EmbedTask,
|
||||
TaskType.EXTRACT: ExtractTask,
|
||||
TaskType.FILTER: FilterTask,
|
||||
TaskType.SPLIT: SplitTask,
|
||||
TaskType.STORE: StoreTask,
|
||||
TaskType.TRANSFORM: TaskUnimplemented,
|
||||
TaskType.VDB_UPLOAD: VdbUploadTask,
|
||||
}
|
||||
|
||||
|
||||
def task_factory(task_type: Union[TaskType, str], **kwargs) -> Task:
|
||||
"""
|
||||
Factory method for creating tasks based on the provided task type.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
task_type : TaskType
|
||||
The type of the task to create.
|
||||
**kwargs : dict
|
||||
Additional keyword arguments to pass to the task's constructor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Task
|
||||
An instance of the task corresponding to the given task type.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If an invalid task type is provided.
|
||||
"""
|
||||
|
||||
if isinstance(task_type, str):
|
||||
if is_valid_task_type(task_type):
|
||||
task_type = TaskType[task_type]
|
||||
else:
|
||||
raise ValueError(f"Invalid task type string: '{task_type}'")
|
||||
elif not isinstance(task_type, TaskType):
|
||||
raise ValueError("task_type must be a TaskType enum member or a valid task type string")
|
||||
|
||||
task_class: Type[Task] = _TASK_MAP[task_type]
|
||||
|
||||
# Inspect the constructor (__init__) of the task class to get its parameters
|
||||
sig = inspect.signature(task_class.__init__)
|
||||
params = sig.parameters
|
||||
|
||||
# Exclude 'self' and positional-only parameters
|
||||
valid_kwargs = {
|
||||
name
|
||||
for name, param in params.items()
|
||||
if param.kind in [param.KEYWORD_ONLY, param.POSITIONAL_OR_KEYWORD] and name != "self"
|
||||
}
|
||||
|
||||
# Check if provided kwargs match the task's constructor parameters
|
||||
for kwarg in kwargs:
|
||||
if kwarg not in valid_kwargs:
|
||||
raise ValueError(f"Unexpected keyword argument '{kwarg}' for task type '{task_type.name}'")
|
||||
|
||||
# Create and return the task instance with the provided kwargs
|
||||
return task_class(**kwargs)
|
||||
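A brief sketch of task_factory, which accepts either a TaskType member or its string name and validates keyword arguments against the target task's constructor:

from nv_ingest_client.primitives.tasks import TaskType, task_factory

split = task_factory(TaskType.SPLIT, split_by="page", split_length=10)
extract = task_factory("EXTRACT", document_type="pdf", extract_text=True)
# task_factory("SPLIT", bogus_option=1) would raise ValueError for the unexpected kwarg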
59
client/src/nv_ingest_client/primitives/tasks/vdb_upload.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=too-few-public-methods
|
||||
# pylint: disable=too-many-arguments
|
||||
|
||||
import logging
|
||||
from typing import Dict
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .task_base import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VdbUploadTaskSchema(BaseModel):
|
||||
filter_errors: bool = False
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class VdbUploadTask(Task):
|
||||
"""
|
||||
Object for vector database upload task
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
filter_errors: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Setup VDB Upload Task Config
|
||||
"""
|
||||
super().__init__()
|
||||
self._filter_errors = filter_errors
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string with the object's config and run time state
|
||||
"""
|
||||
info = ""
|
||||
info += "VDB Upload Task:\n"
|
||||
info += f" filter_errors: {self._filter_errors}\n"
|
||||
return info
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""
|
||||
Convert to a dict for submission to redis
|
||||
"""
|
||||
|
||||
task_properties = {
|
||||
"filter_errors": self._filter_errors,
|
||||
}
|
||||
|
||||
return {"type": "vdb_upload", "task_properties": task_properties}
|
||||
0
client/src/nv_ingest_client/util/__init__.py
Normal file
155
client/src/nv_ingest_client/util/file_processing/extract.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
# pylint: disable=missing-class-docstring
|
||||
# pylint: disable=logging-fstring-interpolation
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import Tuple
|
||||
|
||||
import charset_normalizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Enums
|
||||
class DocumentTypeEnum(str, Enum):
|
||||
pdf = "pdf"
|
||||
txt = "text"
|
||||
docx = "docx"
|
||||
pptx = "pptx"
|
||||
jpeg = "jpeg"
|
||||
bmp = "bmp"
|
||||
png = "png"
|
||||
svg = "svg"
|
||||
html = "html"
|
||||
md = "md"
|
||||
|
||||
|
||||
# Maps MIME types to DocumentTypeEnum
|
||||
MIME_TO_DOCUMENT_TYPE = {
|
||||
"application/pdf": DocumentTypeEnum.pdf,
|
||||
"text/plain": DocumentTypeEnum.txt,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocumentTypeEnum.docx,
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": DocumentTypeEnum.pptx,
|
||||
"image/jpeg": DocumentTypeEnum.jpeg,
|
||||
"image/bmp": DocumentTypeEnum.bmp,
|
||||
"image/png": DocumentTypeEnum.png,
|
||||
"image/svg+xml": DocumentTypeEnum.svg,
|
||||
"text/html": DocumentTypeEnum.html,
|
||||
# Add more as needed
|
||||
}
|
||||
|
||||
# Maps file extensions to DocumentTypeEnum
|
||||
EXTENSION_TO_DOCUMENT_TYPE = {
|
||||
"pdf": DocumentTypeEnum.pdf,
|
||||
"txt": DocumentTypeEnum.txt,
|
||||
"docx": DocumentTypeEnum.docx,
|
||||
"pptx": DocumentTypeEnum.pptx,
|
||||
"jpg": DocumentTypeEnum.jpeg,
|
||||
"jpeg": DocumentTypeEnum.jpeg,
|
||||
"bmp": DocumentTypeEnum.bmp,
|
||||
"png": DocumentTypeEnum.png,
|
||||
"svg": DocumentTypeEnum.svg,
|
||||
"html": DocumentTypeEnum.html,
|
||||
"md": DocumentTypeEnum.txt,
|
||||
"sh": DocumentTypeEnum.txt,
|
||||
"json": DocumentTypeEnum.txt,
|
||||
# Add more as needed
|
||||
}
|
||||
|
||||
|
||||
def get_or_infer_file_type(file_path: str) -> DocumentTypeEnum:
|
||||
"""
|
||||
Determines the file type by inspecting its extension and optionally falls back
|
||||
to MIME type detection if the extension is not recognized.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
file_path : str
|
||||
The path to the file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DocumentTypeEnum
|
||||
An enum value representing the detected file type.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If a valid extension is not found and MIME type detection cannot determine a valid type.
|
||||
"""
|
||||
extension = os.path.splitext(file_path)[1][1:].lower()
|
||||
file_type = EXTENSION_TO_DOCUMENT_TYPE.get(extension)
|
||||
|
||||
# If the file extension maps to a known type, return it
|
||||
if file_type:
|
||||
return file_type
|
||||
|
||||
# TODO(Devin): libmagic is missing on the CI system, so we need to skip this check
|
||||
# If extension is not recognized, attempt MIME type detection as a fallback
|
||||
# mime_type = magic.from_file(file_path, mime=True)
|
||||
# # Attempt to map MIME type to DocumentTypeEnum, if possible
|
||||
# for mime, doc_type in MIME_TO_DOCUMENT_TYPE.items():
|
||||
# if mime_type == mime:
|
||||
# return doc_type
|
||||
|
||||
# If no valid file type is determined, raise an exception
|
||||
raise ValueError(f"Failed to determine file type for: {file_path}")
|
||||
|
||||
|
||||
def serialize_to_base64(file_stream: BytesIO) -> str:
|
||||
"""Reads a PDF file from a BytesIO object and encodes it in base64."""
|
||||
try:
|
||||
content = base64.b64encode(file_stream.read()).decode("utf-8")
|
||||
return content
|
||||
except IOError:
|
||||
logger.error("Failed to read PDF file from BytesIO object")
|
||||
raise
|
||||
|
||||
|
||||
def detect_encoding_and_read_text_file(file_stream: BytesIO) -> str:
|
||||
"""Detects encoding and reads a text file from a BytesIO object accordingly."""
|
||||
try:
|
||||
raw_data = file_stream.read(50000)
|
||||
file_stream.seek(0) # Reset stream position after reading
|
||||
result = charset_normalizer.detect(raw_data)
|
||||
encoding = result.get("encoding") or "utf-8"  # Fall back to utf-8 if encoding is undetected
|
||||
|
||||
content = file_stream.read().decode(encoding)
|
||||
return content
|
||||
except IOError:
|
||||
logger.error("Failed to read text file from BytesIO object")
|
||||
raise
|
||||
|
||||
|
||||
def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:
|
||||
"""Extracts content from a file, supporting different formats."""
|
||||
document_type = get_or_infer_file_type(path)
|
||||
|
||||
with open(path, "rb") as file:
|
||||
file_stream = BytesIO(file.read())
|
||||
|
||||
try:
|
||||
if document_type in [
|
||||
DocumentTypeEnum.txt,
|
||||
DocumentTypeEnum.md,
|
||||
DocumentTypeEnum.html,
|
||||
]:
|
||||
content = detect_encoding_and_read_text_file(file_stream)
|
||||
else:
|
||||
content = serialize_to_base64(file_stream)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {path}: {e}")
|
||||
|
||||
raise ValueError(f"Failed to extract content from {path}") from e
|
||||
|
||||
logger.debug(f"Content extracted from '{path}'")
|
||||
return content, DocumentTypeEnum(document_type)
|
||||
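A minimal sketch of extract_file_content as defined above (the path is a placeholder); text-like formats are decoded with charset detection, everything else is returned base64-encoded:

from nv_ingest_client.util.file_processing.extract import extract_file_content

content, doc_type = extract_file_content("report.pdf")
print(doc_type)  # DocumentTypeEnum.pdf; content is a base64 string for binary formats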
195
client/src/nv_ingest_client/util/util.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
import typing
|
||||
from io import BytesIO
|
||||
from typing import Dict
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docx import Document as DocxDocument
|
||||
from nv_ingest_client.util.file_processing.extract import DocumentTypeEnum
|
||||
from nv_ingest_client.util.file_processing.extract import detect_encoding_and_read_text_file
|
||||
from nv_ingest_client.util.file_processing.extract import extract_file_content
|
||||
from nv_ingest_client.util.file_processing.extract import get_or_infer_file_type
|
||||
from pptx import Presentation
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
# pylint: disable=missing-class-docstring
|
||||
# pylint: disable=logging-fstring-interpolation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def estimate_page_count(file_path: str) -> int:
|
||||
document_type = get_or_infer_file_type(file_path)
|
||||
|
||||
if document_type in [
|
||||
DocumentTypeEnum.pdf,
|
||||
DocumentTypeEnum.docx,
|
||||
DocumentTypeEnum.pptx,
|
||||
]:
|
||||
return count_pages_for_documents(file_path, document_type)
|
||||
elif document_type in [
|
||||
DocumentTypeEnum.txt,
|
||||
DocumentTypeEnum.md,
|
||||
DocumentTypeEnum.html,
|
||||
]:
|
||||
return count_pages_for_text(file_path)
|
||||
elif document_type in [
|
||||
DocumentTypeEnum.jpeg,
|
||||
DocumentTypeEnum.bmp,
|
||||
DocumentTypeEnum.png,
|
||||
DocumentTypeEnum.svg,
|
||||
]:
|
||||
return 1 # Image types assumed to be 1 page
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def count_pages_for_documents(file_path: str, document_type: DocumentTypeEnum) -> int:
|
||||
try:
|
||||
if document_type == DocumentTypeEnum.pdf:
|
||||
doc = pdfium.PdfDocument(file_path)
|
||||
return len(doc)
|
||||
elif document_type == DocumentTypeEnum.docx:
|
||||
doc = DocxDocument(file_path)
|
||||
# Approximation, as word documents do not have a direct 'page count' attribute
|
||||
return len(doc.paragraphs) // 15
|
||||
elif document_type == DocumentTypeEnum.pptx:
|
||||
ppt = Presentation(file_path)
|
||||
return len(ppt.slides)
|
||||
except FileNotFoundError:
|
||||
print(f"The file {file_path} was not found.")
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f"An error occurred while processing {file_path}: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
def count_pages_for_text(file_path: str) -> int:
|
||||
"""
|
||||
Estimates the page count for text files based on word count,
|
||||
using the detect_encoding_and_read_text_file function for reading.
|
||||
"""
|
||||
try:
|
||||
with open(file_path, "rb") as file: # Open file in binary mode
|
||||
file_stream = BytesIO(file.read()) # Create BytesIO object from file content
|
||||
|
||||
content = detect_encoding_and_read_text_file(file_stream) # Read and decode content
|
||||
word_count = len(content.split())
|
||||
pages_estimated = word_count / 300
|
||||
return round(pages_estimated)
|
||||
except FileNotFoundError:
|
||||
logger.error(f"The file {file_path} was not found.")
|
||||
return 0
|
||||
except Exception as e:
|
||||
logger.error(f"An error occurred while processing {file_path}: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
def _process_file(file_path: str):
|
||||
"""
|
||||
Synchronously processes a single file, extracting its content and collecting file details.
|
||||
|
||||
This function serves as a high-level interface for file processing, invoking content
|
||||
extraction and aggregating the results along with file metadata. It is designed to work
|
||||
within a larger processing pipeline, providing necessary data for subsequent tasks or
|
||||
storage.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
file_path : str
|
||||
The path to the file that needs to be processed.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
A dictionary containing details about the processed file, including its name, a unique
|
||||
identifier, the extracted content, and the document type.
|
||||
|
||||
Raises
|
||||
------
|
||||
Exception
|
||||
Propagates any exceptions encountered during the file processing, signaling issues with
|
||||
content extraction or file handling.
|
||||
|
||||
Notes
|
||||
-----
|
||||
- The function directly utilizes `extract_file_content` for content extraction and performs
|
||||
basic error handling.
|
||||
- It constructs a simple metadata object that can be utilized for further processing or
|
||||
logging.
|
||||
"""
|
||||
|
||||
try:
|
||||
file_name = os.path.basename(file_path)
|
||||
content, document_type = extract_file_content(file_path) # Call the synchronous function directly
|
||||
|
||||
return {
|
||||
"source_name": file_name,
|
||||
"source_id": file_name,
|
||||
"content": content,
|
||||
"document_type": document_type,
|
||||
}
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
logger.error(f"Error processing file {file_path}: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def load_data_from_path(path: str) -> Dict:
|
||||
"""
|
||||
Loads data from a specified file path, preparing it for processing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
The path to the file from which data should be loaded.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
A dictionary containing keys 'source_name', 'source_id', 'content', and 'document_type',
|
||||
each of which maps to a list that includes the respective details for the processed file.
|
||||
|
||||
Raises
|
||||
------
|
||||
FileNotFoundError
|
||||
If the specified path does not exist.
|
||||
ValueError
|
||||
If the specified path is not a file.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function is designed to load and prepare file data for further processing,
|
||||
packaging the loaded data along with metadata such as file name and document type.
|
||||
"""
|
||||
|
||||
result = {"source_name": [], "source_id": [], "content": [], "document_type": []}
|
||||
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"The path {path} does not exist.")
|
||||
|
||||
if not os.path.isfile(path):
|
||||
raise ValueError("The provided path is not a file.")
|
||||
|
||||
file_data = _process_file(file_path=path)
|
||||
result["content"].append(file_data["content"])
|
||||
result["document_type"].append(file_data["document_type"])
|
||||
result["source_name"].append(file_data["source_name"])
|
||||
result["source_id"].append(file_data["source_id"])
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def check_ingest_result(json_payload: Dict) -> typing.Tuple[bool, str]:
|
||||
# A job is considered failed when the top-level 'status' field is 'failed'
is_failed = json_payload.get("status", "") == "failed"
|
||||
description = json_payload.get("description", "")
|
||||
|
||||
return is_failed, description
|
||||
44 config/otel-collector-config.yaml (new file)
@@ -0,0 +1,44 @@
|
||||
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
|
||||
exporters:
|
||||
# NOTE: Prior to v0.86.0 use `logging` instead of `debug`.
|
||||
zipkin:
|
||||
endpoint: "http://zipkin:9411/api/v2/spans"
|
||||
logging:
|
||||
verbosity: detailed
|
||||
prometheus:
|
||||
endpoint: "0.0.0.0:8889"
|
||||
|
||||
processors:
|
||||
batch:
|
||||
|
||||
extensions:
|
||||
health_check:
|
||||
zpages:
|
||||
|
||||
service:
|
||||
extensions: [zpages, health_check]
|
||||
telemetry:
|
||||
logs:
|
||||
level: "debug"
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [zipkin, logging]
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [prometheus, logging]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [logging]
|
||||
11 config/prometheus.yaml (new file)
@@ -0,0 +1,11 @@
|
||||
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "otel-collector"
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ["otel-collector:8889"]
|
||||
- targets: ["otel-collector:8888"]
|
||||
BIN data/embedded_table.pdf (new binary file, not shown)
4564 data/functional_validation.json (new file; diff suppressed because one or more lines are too long)
BIN data/functional_validation.pdf (new binary file, not shown)
BIN data/multimodal_test.pdf (new binary file, not shown)
BIN data/table_test.pdf (new binary file, not shown)
BIN data/test.pdf (new binary file, not shown)
BIN data/woods_frost.docx (new binary file, not shown)
BIN data/woods_frost.pdf (new binary file, not shown)
300 docker-compose.yaml (new file)
@@ -0,0 +1,300 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
redis:
|
||||
image: "redis/redis-stack"
|
||||
ports:
|
||||
- "6379:6379"
|
||||
|
||||
# tika:
|
||||
# image: apache/tika:latest
|
||||
# ports:
|
||||
# - "9998:9998"
|
||||
|
||||
yolox:
|
||||
image: #placeholder
|
||||
ports:
|
||||
- "8000:8000"
|
||||
- "8001:8001"
|
||||
- "8002:8002"
|
||||
volumes:
|
||||
- ${HOME}/.cache:/home/nvs/.cache
|
||||
user: root
|
||||
environment:
|
||||
- NIM_HTTP_API_PORT=8000
|
||||
- NIM_TRITON_LOG_VERBOSE=1
|
||||
- NGC_API_KEY=$NIM_NGC_API_KEY
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
deplot:
|
||||
image: #placeholder
|
||||
ports:
|
||||
- "8003:8000"
|
||||
- "8004:8001"
|
||||
- "8005:8002"
|
||||
volumes:
|
||||
- ${HOME}/.cache:/opt/nim/.cache
|
||||
user: root
|
||||
environment:
|
||||
- NIM_HTTP_API_PORT=8000
|
||||
- NIM_TRITON_LOG_VERBOSE=1
|
||||
- NGC_API_KEY=$NIM_NGC_API_KEY
|
||||
- CUDA_VISIBLE_DEVICES=1
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
cached:
|
||||
image: #placeholder
|
||||
shm_size: 2gb
|
||||
ports:
|
||||
- "8006:8000"
|
||||
- "8007:8001"
|
||||
- "8008:8002"
|
||||
volumes:
|
||||
- ${HOME}/.cache:/home/nvs/.cache
|
||||
user: root
|
||||
environment:
|
||||
- NIM_HTTP_API_PORT=8000
|
||||
- NIM_TRITON_LOG_VERBOSE=1
|
||||
- NGC_API_KEY=$NIM_NGC_API_KEY
|
||||
- CUDA_VISIBLE_DEVICES=2
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
paddle:
|
||||
image: #placeholder
|
||||
shm_size: 2gb
|
||||
ports:
|
||||
- "8009:8000"
|
||||
- "8010:8001"
|
||||
- "8011:8002"
|
||||
volumes:
|
||||
- ${HOME}/.cache:/home/nvs/.cache
|
||||
user: root
|
||||
environment:
|
||||
- NIM_HTTP_API_PORT=8000
|
||||
- NIM_TRITON_LOG_VERBOSE=1
|
||||
- NGC_API_KEY=$NIM_NGC_API_KEY
|
||||
- CUDA_VISIBLE_DEVICES=3
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
embedding:
|
||||
# NIM ON
|
||||
image: #placeholder
|
||||
shm_size: 16gb
|
||||
ports:
|
||||
- "8012:8000"
|
||||
- "8013:8001"
|
||||
- "8014:8002"
|
||||
environment:
|
||||
- NIM_HTTP_API_PORT=8000
|
||||
- NIM_TRITON_LOG_VERBOSE=1
|
||||
- NGC_API_KEY=$NIM_NGC_API_KEY
|
||||
- CUDA_VISIBLE_DEVICES=3
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
nv-ingest-ms-runtime:
|
||||
image: "nv-ingest:latest"
|
||||
build:
|
||||
context: ${NV_INGEST_ROOT}
|
||||
dockerfile: "./Dockerfile"
|
||||
target: runtime
|
||||
volumes:
|
||||
- ${DATASET_ROOT}:/workspace/data
|
||||
cap_add:
|
||||
- sys_nice
|
||||
environment:
|
||||
- CACHED_GRPC_ENDPOINT=cached:8001
|
||||
- CACHED_HTTP_ENDPOINT=""
|
||||
- DEPLOT_GRPC_ENDPOINT=""
|
||||
# build.nvidia.com hosted deplot
|
||||
#- DEPLOT_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/vlm/google/deplot
|
||||
# self hosted deplot
|
||||
- DEPLOT_HTTP_ENDPOINT=http://deplot:8000/v1/chat/completions
|
||||
- ECLAIR_GRPC_TRITON=triton-eclair:8001
|
||||
- INGEST_LOG_LEVEL=INFO
|
||||
- MESSAGE_CLIENT_HOST=redis
|
||||
- MESSAGE_CLIENT_PORT=6379
|
||||
- MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest}
|
||||
- NGC_API_KEY=${NGC_API_KEY:-ngcapikey}
|
||||
- NVIDIA_BUILD_API_KEY=${NVIDIA_BUILD_API_KEY:-nvidiabuildkey}
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317
|
||||
- PADDLE_GRPC_ENDPOINT=paddle:8001
|
||||
- PADDLE_HTTP_ENDPOINT=""
|
||||
- REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue
|
||||
- TABLE_DETECTION_GRPC_TRITON=yolox:8001
|
||||
- TABLE_DETECTION_HTTP_TRITON=""
|
||||
- YOLOX_GRPC_ENDPOINT=yolox:8001
|
||||
- YOLOX_HTTP_ENDPOINT=""
|
||||
- CUDA_VISIBLE_DEVICES=3
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
|
||||
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:0.91.0
|
||||
hostname: otel-collector
|
||||
command: ["--config=/etc/otel-collector-config.yaml"]
|
||||
volumes:
|
||||
- ./config/otel-collector-config.yaml:/etc/otel-collector-config.yaml
|
||||
ports:
|
||||
- "8888:8888" # Prometheus metrics exposed by the collector
|
||||
- "8889:8889" # Prometheus exporter metrics
|
||||
- "13133:13133" # health_check extension
|
||||
- "9411" # Zipkin receiver
|
||||
- "4317:4317" # OTLP gRPC receiver
|
||||
- "4318:4318" # OTLP/HTTP receiver
|
||||
- "55680:55679" # zpages extension
|
||||
depends_on:
|
||||
- zipkin
|
||||
|
||||
zipkin:
|
||||
image: openzipkin/zipkin
|
||||
environment:
|
||||
JAVA_OPTS: "-Xms2g -Xmx2g -XX:+ExitOnOutOfMemoryError"
|
||||
ports:
|
||||
- "9411:9411" # Zipkin UI and API
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
command:
|
||||
- --web.console.templates=/etc/prometheus/consoles
|
||||
- --web.console.libraries=/etc/prometheus/console_libraries
|
||||
- --storage.tsdb.retention.time=1h
|
||||
- --config.file=/etc/prometheus/prometheus-config.yaml
|
||||
- --storage.tsdb.path=/prometheus
|
||||
- --web.enable-lifecycle
|
||||
- --web.route-prefix=/
|
||||
- --enable-feature=exemplar-storage
|
||||
- --enable-feature=otlp-write-receiver
|
||||
volumes:
|
||||
- ./config/prometheus.yaml:/etc/prometheus/prometheus-config.yaml
|
||||
ports:
|
||||
- "9090:9090"
|
||||
|
||||
grafana:
|
||||
container_name: grafana-service
|
||||
image: grafana/grafana
|
||||
ports:
|
||||
- "3000:3000"
|
||||
|
||||
# etcd:
|
||||
# # Turn on to leverage the `vdb_upload` task
|
||||
# restart: always
|
||||
# container_name: milvus-etcd
|
||||
# image: quay.io/coreos/etcd:v3.5.5
|
||||
# environment:
|
||||
# - ETCD_AUTO_COMPACTION_MODE=revision
|
||||
# - ETCD_AUTO_COMPACTION_RETENTION=1000
|
||||
# - ETCD_QUOTA_BACKEND_BYTES=4294967296
|
||||
# - ETCD_SNAPSHOT_COUNT=50000
|
||||
# volumes:
|
||||
# - ./.volumes/etcd:/etcd
|
||||
# command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
|
||||
# healthcheck:
|
||||
# test: ["CMD", "etcdctl", "endpoint", "health"]
|
||||
# interval: 30s
|
||||
# timeout: 20s
|
||||
# retries: 3
|
||||
|
||||
# minio:
|
||||
# # Turn on to leverage the `store` and `vdb_upload` task
|
||||
# restart: always
|
||||
# container_name: minio
|
||||
# hostname: minio
|
||||
# image: minio/minio:RELEASE.2023-03-20T20-16-18Z
|
||||
# environment:
|
||||
# MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-minioadmin}
|
||||
# MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-minioadmin}
|
||||
# ports:
|
||||
# - "9001:9001"
|
||||
# - "9000:9000"
|
||||
# volumes:
|
||||
# - ./.volumes/minio:/minio_data
|
||||
# command: minio server /minio_data --console-address ":9001"
|
||||
# healthcheck:
|
||||
# test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
# interval: 30s
|
||||
# timeout: 20s
|
||||
# retries: 3
|
||||
|
||||
# milvus:
|
||||
# # Turn on to leverage the `vdb_upload` task
|
||||
# restart: always
|
||||
# container_name: milvus-standalone
|
||||
# image: milvusdb/milvus:v2.3.5
|
||||
# command: ["milvus", "run", "standalone"]
|
||||
# hostname: milvus
|
||||
# security_opt:
|
||||
# - seccomp:unconfined
|
||||
# environment:
|
||||
# ETCD_ENDPOINTS: etcd:2379
|
||||
# MINIO_ADDRESS: minio:9000
|
||||
# volumes:
|
||||
# - ./.volumes/milvus:/var/lib/milvus
|
||||
# healthcheck:
|
||||
# test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
|
||||
# interval: 30s
|
||||
# start_period: 90s
|
||||
# timeout: 20s
|
||||
# retries: 3
|
||||
# ports:
|
||||
# - "19530:19530"
|
||||
# - "9091:9091"
|
||||
# depends_on:
|
||||
# - "etcd"
|
||||
# - "minio"
|
||||
|
||||
# attu:
|
||||
# # Turn on to leverage the `vdb_upload` task
|
||||
# restart: always
|
||||
# container_name: milvus-attu
|
||||
# image: zilliz/attu:v2.3.5
|
||||
# hostname: attu
|
||||
# environment:
|
||||
# MILVUS_URL: http://milvus:19530
|
||||
# ports:
|
||||
# - "3001:3000"
|
||||
# depends_on:
|
||||
# - "milvus"
|
||||
17 docker/scripts/entrypoint_source_ext.sh (new file)
@@ -0,0 +1,17 @@
|
||||
#!/bin/sh
|
||||
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
set -e
|
||||
|
||||
# Run preparation tasks here
|
||||
if [ "$INSTALL_ADOBE_SDK" = "true" ]; then
|
||||
echo "Checking if Adobe PDF Services SDK is installed..."
|
||||
|
||||
# Check if pdfservices-sdk is installed
|
||||
if ! python -c "import pkg_resources; pkg_resources.require('pdfservices-sdk~=4.0.0')" 2>/dev/null; then
|
||||
echo "Installing Adobe PDF Services SDK..."
|
||||
pip install "pdfservices-sdk~=4.0.0"
|
||||
fi
|
||||
fi
|
||||
43 docs/content-metadata.md (new file)
@@ -0,0 +1,43 @@
|
||||
**Definitions**:
|
||||
Source: The knowledge base file from which content and metadata are extracted
|
||||
Content: Data extracted from a source; generally Text or Image
|
||||
Metadata: Descriptive data that can be associated with Sources or Content (Image or Text); metadata can be extracted from the Source/Content, or generated using models, heuristics, etc.
|
||||
|
||||
| | Field | Description | Method |
|
||||
| ----- | :---- | :---- | :---- |
|
||||
| Content | Content | Content extracted from Source | Extracted |
|
||||
| Source Metadata | Source Name | Name of source | Extracted |
|
||||
| | Source ID | ID of source | Extracted |
|
||||
| | Source location | URL, URI, pointer to storage location | ? |
|
||||
| | Source Type | PDF, HTML, Docx, TXT, PPTx | Extracted |
|
||||
| | Collection ID | Collection in which the source is contained | ? |
|
||||
| | Date Created | Date source was created | Extracted | ? |
|
||||
| | Last Modified | Date source was last modified | Extracted | ? |
|
||||
| | Summary | Summarization of Source Doc | Generated | Pending Research |
|
||||
| | Partition ID | Offset of this data fragment within a larger set of fragments | Generated |
|
||||
| | Access Level | Dictates RBAC | ? | N |
|
||||
| Content Metadata (applicable to all content types) | Type | Text, Image, Structured, Table, Chart | Generated |
|
||||
| | Description | Text Description of the content object (Image/Table) | Generated |
|
||||
| | Page \# | Page \# where content is contained in source | Extracted |
|
||||
| | Hierarchy | Location/order of content within the source document | Extracted |
|
||||
| | Subtype | For structured data subtypes \- table, chart, etc.. | | |
|
||||
| Text Metadata | Text Type | Header, body, etc | Extracted |
|
||||
| | Summary | Abbreviated Summary of content | Generated | Pending Research |
|
||||
| | Keywords | Keywords, Named Entities, or other phrases | Extracted | N |
|
||||
| | Language | | Generated | N |
|
||||
| Image Metadata | Image Type | Structured, Natural, Hybrid, etc | Generated (Classifier) | Y (needs to be developed) |
|
||||
| | Structured Image Type | Bar Chart, Pie Chart, etc | Generated (Classifier) | Y(needs to be developed) |
|
||||
| | Caption | Any caption or subheader associated with Image | Extracted |
|
||||
| | Text | Extracted text from a structured chart | Extracted | Pending Research |
|
||||
| | Image location | Location (x,y) of chart within an image | Extracted | |
|
||||
| | uploaded\_image\_uri | Mirrors source\_metadata.source\_location | | |
|
||||
| Table Metadata (tables within documents) | Table format | Structured (dataframe / lists of rows and columns), or serialized as markdown, html, latex, simple (cells separated just as spaces) | Extracted |
|
||||
| | Table content | Extracted text content, formatted according to table\_metadata.table\_format. Important: Tables should not be chunked | Extracted | |
|
||||
| | Table location | Bounding box of the table | Extracted | |
|
||||
| | Caption | Detected captions for the table/chart | Extracted | |
|
||||
| | Title | TODO | Extracted | |
|
||||
| | Subtitle | TODO | Extracted | |
|
||||
| | Axis | TODO | Extracted | |
|
||||
| | uploaded\_image\_uri | Mirrors source\_metadata.source\_location | Generated | |
|
||||
| Chart Metadata | TODO | | | |
|
||||
|
||||
47 docs/deployment.md (new file)
@@ -0,0 +1,47 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
### Launch nv-ingest micro-service(s)
|
||||
|
||||
```bash
|
||||
# Redis is our message broker for the ingest service, always required.
|
||||
docker compose up -d redis
|
||||
|
||||
# `yolox`, `deplot`, `cached`, and `paddle` are NIMs used to perform table and chart extraction.
|
||||
docker compose up -d yolox deplot cached paddle
|
||||
|
||||
# Optional: MinIO is an object store for extracted images, tables, and charts; by default it is commented out in the docker compose file.
# The `store` task will not be functional without this service or an external S3-compliant object store.
|
||||
docker compose up -d minio
|
||||
|
||||
# Optional: Milvus is a vector database for storing embeddings from multimodal extractions; by default it is commented out in the docker compose file.
# The `vdb_upload` task will not be functional without this service or an external Milvus database.
|
||||
docker compose up -d etcd minio milvus attu
|
||||
|
||||
# Optional (Telemetry services)
|
||||
# TODO: Add examples for telemetry services
|
||||
docker compose up -d otel-collector prometheus grafana zipkin
|
||||
|
||||
# Optional (Embedding NIM) Stand up `nv-embedqa-e5-v5` NIM to calculate embeddings for extracted content.
|
||||
# The `embed` task will not be functional without this service.
|
||||
docker compose up -d embedding
|
||||
|
||||
# Optional: Triton. See below for Triton setup; Triton is needed for model inference.
# This is only needed for captioning or ECLAIR-based extraction.
|
||||
docker compose up -d triton
|
||||
|
||||
# Ingest service
|
||||
docker compose up -d nv-ingest-ms-runtime
|
||||
```
|
||||
|
||||
You should see something like this:
|
||||
|
||||
```bash
|
||||
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
|
||||
6065c12d6034 .../nv-ingest:2024.6.3.dev0 "/opt/conda/bin/tini…" 6 hours ago Up 6 hours nv-ingest-ms-runtime-1
|
||||
c1f1f6b9cc8c .../tritonserver:24.05-py3 "/opt/nvidia/nvidia_…" 5 days ago Up 8 hours 0.0.0.0:8000-8002->8000-8002/tcp devin-nv-ingest-triton-1
|
||||
d277cf2c2703 redis/redis-stack "/entrypoint.sh" 2 weeks ago Up 8 hours 0.0.0.0:6379->6379/tcp, 8001/tcp devin-nv-ingest-redis-1
|
||||
```
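
If a container fails to come up, the compose logs are the first place to look. A quick sanity check, assuming the default service names from the compose file above:

```bash
# Tail the ingest runtime logs
docker compose logs -f nv-ingest-ms-runtime

# Confirm the Redis message broker is reachable
docker compose exec redis redis-cli ping   # expected reply: PONG
```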
|
||||
10 docs/dev/triton_models.md (new file)
@@ -0,0 +1,10 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
### Create Triton model
|
||||
|
||||
By default, NV-Ingest does not require Triton, but if you are testing tasks that require Triton, you will need to create a Triton container and/or models for the tasks you are testing.
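
As a rough starting point, a Triton server can be launched with Docker and pointed at a local model repository. The image tag, ports, and mount path below are illustrative, not NV-Ingest-specific requirements:

```shell
docker run --rm --gpus all \
  -p 8000:8000 -p 8001:8001 -p 8002:8002 \
  -v ${PWD}/models:/models \
  nvcr.io/nvidia/tritonserver:24.05-py3 \
  tritonserver --model-repository=/models
```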
|
||||
69 docs/environment-config.md (new file)
@@ -0,0 +1,69 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
### **Environment Configuration Variables**
|
||||
|
||||
- **`MESSAGE_CLIENT_HOST`**:
|
||||
|
||||
- **Description**: Specifies the hostname or IP address of the message broker used for communication between
|
||||
services.
|
||||
- **Example**: `redis`, `localhost`, `192.168.1.10`
|
||||
|
||||
- **`MESSAGE_CLIENT_PORT`**:
|
||||
|
||||
- **Description**: Specifies the port number on which the message broker is listening.
|
||||
- **Example**: `6379`, `5672`
|
||||
|
||||
- **`CAPTION_CLASSIFIER_GRPC_TRITON`**:
|
||||
|
||||
- **Description**: The endpoint where the caption classifier model is hosted using gRPC for communication. This is
|
||||
used to send requests for caption classification.
|
||||
You must specify only ONE HTTP or gRPC endpoint. If both are specified, gRPC will take precedence.
|
||||
- **Example**: `triton:8001`
|
||||
|
||||
- **`CAPTION_CLASSIFIER_MODEL_NAME`**:
|
||||
|
||||
- **Description**: The name of the caption classifier model.
|
||||
- **Example**: `deberta_large`
|
||||
|
||||
- **`REDIS_MORPHEUS_TASK_QUEUE`**:
|
||||
|
||||
- **Description**: The name of the task queue in Redis where tasks are stored and processed.
|
||||
- **Example**: `morpheus_task_queue`
|
||||
|
||||
- **`ECLAIR_TRITON_HOST`**:
|
||||
|
||||
- **Description**: The hostname or IP address of the ECLAIR model service.
|
||||
- **Example**: `triton-eclair`
|
||||
|
||||
- **`ECLAIR_TRITON_PORT`**:
|
||||
|
||||
- **Description**: The port number on which the ECLAIR model service is listening.
|
||||
- **Example**: `8001`
|
||||
|
||||
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**:
|
||||
|
||||
- **Description**: The endpoint for the OpenTelemetry exporter, used for sending telemetry data.
|
||||
- **Example**: `http://otel-collector:4317`
|
||||
|
||||
- **`NGC_API_KEY`**:
|
||||
|
||||
- **Description**: An authorized NGC API key, used to interact with hosted NIMs; it can be generated here: https://org.ngc.nvidia.com/setup/personal-keys.
|
||||
- **Example**: `hFFVc4XzxR***********WUzKYOCtZE`
|
||||
|
||||
- **`MINIO_BUCKET`**:
|
||||
|
||||
- **Description**: Name of MinIO bucket, used to store image, table, and chart extractions.
|
||||
- **Example**: `nv-ingest`
|
||||
|
||||
- **`INGEST_LOG_LEVEL`**:
|
||||
|
||||
- **Description**: The log level for the ingest service, which controls the verbosity of the logging output.
|
||||
- **Example**: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`
|
||||
|
||||
- **`NVIDIA_BUILD_API_KEY`**:
|
||||
|
||||
- **Description**: This is required if you are using the build.nvidia.com endpoint instead of a self-hosted Deplot NIM.
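
When running via Docker Compose, these variables are typically supplied through your shell environment or a `.env` file next to `docker-compose.yaml`. A minimal illustrative example (values are placeholders, not defaults):

```shell
MESSAGE_CLIENT_HOST=redis
MESSAGE_CLIENT_PORT=6379
INGEST_LOG_LEVEL=INFO
MINIO_BUCKET=nv-ingest
NGC_API_KEY=<your-ngc-api-key>
```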
|
||||
BIN docs/images/eclair_batch_size.png (new binary file, 8.6 KiB, not shown)
BIN docs/images/image_viewer_example.png (new binary file, 158 KiB, not shown)
BIN docs/images/prometheus.png (new binary file, 63 KiB, not shown)
BIN docs/images/test.pdf.png (new binary file, 361 KiB, not shown)
BIN docs/images/zipkin.png (new binary file, 153 KiB, not shown)
389 docs/kubernetes-dev.md (new file)
@@ -0,0 +1,389 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
# Developing with Kubernetes
|
||||
|
||||
Developing directly on Kubernetes gives us more confidence that things will work as expected in end user deployments.
|
||||
|
||||
This page describes how to use Kubernetes generally, and how to deploy nv-ingest on a local Kubernetes cluster.
|
||||
|
||||
> **NOTE:** _Unless otherwise noted, all commands below should be run from the root of this repo._
|
||||
|
||||
## Kubernetes cluster
|
||||
|
||||
To start you need a Kubernetes cluster.
|
||||
To get started, we recommend using `kind`, which creates a single Docker container with a Kubernetes cluster inside it.
|
||||
|
||||
Also, because the `kind` cluster needs access to the GPUs on your system, you need a GPU-enabled `kind` setup. The easiest way to do this is to follow the instructions in the kind-with-gpus-examples GitHub repo: https://github.com/klueska/kind-with-gpus-examples/tree/master
|
||||
|
||||
Benefits of this:
|
||||
|
||||
- allows many developers on the same system to have isolated Kubernetes clusters
|
||||
- enables easy creation/deletion of clusters
|
||||
|
||||
Run the following **from the root of the repo** to create a configuration file for your cluster.
|
||||
|
||||
```shell
|
||||
mkdir -p ./.tmp
|
||||
|
||||
cat <<EOF > ./.tmp/kind-config.yaml
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
name: nv-ingest-${USER}
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.29.2
|
||||
{{- range \$gpu := until numGPUs }}
|
||||
- role: worker
|
||||
extraMounts:
|
||||
# We inject all NVIDIA GPUs using the nvidia-container-runtime.
|
||||
# This requires 'accept-nvidia-visible-devices-as-volume-mounts = true' be set
|
||||
# in '/etc/nvidia-container-runtime/config.toml'
|
||||
- hostPath: /dev/null
|
||||
containerPath: /var/run/nvidia-container-devices/{{ \$gpu }}
|
||||
{{- end }}
|
||||
EOF
|
||||
```
|
||||
|
||||
Then use the `nvkind` CLI to create your cluster.
|
||||
|
||||
```shell
|
||||
nvkind cluster create \
|
||||
--config-template ./.tmp/kind-config.yaml
|
||||
```
|
||||
|
||||
You should see output like this:
|
||||
|
||||
```shell
|
||||
Creating cluster "jdyer" ...
|
||||
✓ Ensuring node image (kindest/node:v1.27.11) 🖼
|
||||
✓ Preparing nodes 📦
|
||||
✓ Writing configuration 📜
|
||||
✓ Starting control-plane 🕹️
|
||||
✓ Installing CNI 🔌
|
||||
✓ Installing StorageClass 💾
|
||||
Set kubectl context to "kind-jdyer"
|
||||
You can now use your cluster with:
|
||||
|
||||
kubectl cluster-info --context kind-jdyer
|
||||
|
||||
Have a nice day! 👋
|
||||
```
|
||||
|
||||
You can list clusters on the system with `kind get clusters`.
|
||||
|
||||
```shell
|
||||
kind get clusters
|
||||
# jdyer
|
||||
```
|
||||
|
||||
You can also just use `docker ps` to see the kind container.
|
||||
|
||||
```shell
|
||||
docker ps | grep kind
|
||||
# aaf5216a3cc8 kindest/node:v1.27.11 "/usr/local/bin/entr…" 44 seconds ago Up 42 seconds 127.0.0.1:45099->6443/tcp jdyer-control-plane
|
||||
```
|
||||
|
||||
`kind create cluster` will do the following:
|
||||
|
||||
- add a context for this cluster to `${HOME}/.kube/config`, the default config file used by tools like `kubectl`
|
||||
- change the default context to that one
|
||||
|
||||
You should be able to use `kubectl` immediately, and it should be pointed at that cluster you just created.
|
||||
|
||||
For example, to ensure the cluster was set up successfully, try listing nodes.
|
||||
|
||||
```shell
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
If that worked, you should see a single node, like this:
|
||||
|
||||
```text
|
||||
NAME STATUS ROLES AGE VERSION
|
||||
jdyer-control-plane Ready control-plane 63s v1.27.11
|
||||
```
|
||||
|
||||
Note: None of the containers created inside your Kubernetes cluster will show up when you run `docker ps`, as they are nested within a separate containerd namespace.
|
||||
|
||||
See "debugging tools" in the "Troubleshooting" section below.
|
||||
|
||||
## Skaffold
|
||||
|
||||
Now that you have a Kubernetes cluster you can use `skaffold` to build and deploy your development environment.
|
||||
|
||||
Skaffold does a few things for you in a single command:
|
||||
|
||||
- Build containers from the current directory (via `docker build`).
|
||||
- Install the retriever-ingest helm charts (via `helm install`).
|
||||
- Apply additional Kubernetes manifests (via `kustomize`).
|
||||
- Hot reloading - skaffold watches your local directory for changes and syncs them into the Kubernetes container.
|
||||
- _for details on this, see "Hot reloading" below ([link](#hot-reloading))_
|
||||
- Port forwards the retriever-ingest service to the host.
|
||||
|
||||
### Directory structure
|
||||
|
||||
- `skaffold/sensitive/` contains any secrets or manifests you want deployed to your cluster, but not checked into git, as your local cluster is unlikely to have ESO installed. If it does, feel free to use `kind: ExternalSecret` instead.
|
||||
- `skaffold/components` contains any k8s manifests you want deployed in any skaffold file. The paths are relative and can be used in either `kustomize` or `rawYaml` formats:
|
||||
|
||||
```yaml
|
||||
manifests:
|
||||
rawYaml:
|
||||
- sensitive/*.yaml
|
||||
kustomize:
|
||||
paths:
|
||||
- components/elasticsearch
|
||||
```
|
||||
|
||||
- If adding a new service, look for an existing Helm chart for it first. If none exists, you may have to encapsulate it with your own k8s manifests in `skaffold/components`. We are a k8s shop, so manifest writing may be required from time to time.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
#### Add Helm repos
|
||||
|
||||
The retriever-ingest service's deployment requires pulling in configurations for other services from third-party sources,
|
||||
e.g. Elasticsearch, OpenTelemetry, and Postgres.
|
||||
|
||||
The first time you try to deploy this project to a local Kubernetes cluster, you may need to tell
|
||||
your local version of `Helm` (a package manager for Kubernetes configurations) where to find those
|
||||
third-party things, by running something like the following.
|
||||
|
||||
```shell
|
||||
helm repo add \
|
||||
nvdp \
|
||||
https://nvidia.github.io/k8s-device-plugin
|
||||
|
||||
helm repo add \
|
||||
zipkin \
|
||||
https://zipkin.io/zipkin-helm
|
||||
|
||||
helm repo add \
|
||||
opentelemetry \
|
||||
https://open-telemetry.github.io/opentelemetry-helm-charts
|
||||
|
||||
helm repo add \
|
||||
nvidia \
|
||||
https://helm.ngc.nvidia.com/nvidia
|
||||
|
||||
helm repo add \
|
||||
bitnami \
|
||||
https://charts.bitnami.com/bitnami
|
||||
```
|
||||
|
||||
For the full list of repositories, see the `dependencies` section in [this project's Chart.yaml](../../helm/Chart.yaml).
|
||||
|
||||
#### Nvidia GPU Support
|
||||
|
||||
In order for the deployed Kubernetes pods to access the NVIDIA GPU resources, the [NVIDIA k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) must be installed. There are many possible configurations for this plugin, but for a straightforward route to start development you can simply run:
|
||||
|
||||
```shell
|
||||
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml
|
||||
```
|
||||
|
||||
#### Create an image pull secret
|
||||
|
||||
You'll also need to provide a Kubernetes Secret with credentials to pull NVIDIA-private Docker images.
|
||||
|
||||
For short-lived development clusters, just use your own individual credentials.
|
||||
|
||||
```shell
|
||||
DOCKER_CONFIG_JSON=$(
|
||||
cat "${HOME}/.docker/config.json" \
|
||||
| base64 -w 0
|
||||
)
|
||||
|
||||
cat <<EOF > ./skaffold/sensitive/imagepull.yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: nvcrimagepullsecret
|
||||
type: kubernetes.io/dockerconfigjson
|
||||
data:
|
||||
.dockerconfigjson: ${DOCKER_CONFIG_JSON}
|
||||
EOF
|
||||
```
|
||||
|
||||
An NGC personal API key is needed to access models and images hosted on NGC.
|
||||
Make sure that you have followed the steps of _[Ensure you have access to NGC](./index.md#ensure-you-have-access-to-ngc)_.
|
||||
Next, store the key in an environment variable:
|
||||
|
||||
```shell
|
||||
export NGC_API_KEY="<YOUR_KEY_HERE>"
|
||||
```
|
||||
|
||||
Then create the secret manifest with:
|
||||
|
||||
```shell
|
||||
kubectl create secret generic ngcapisecrets \
|
||||
--from-literal=ngc_api_key="${NGC_API_KEY}" \
|
||||
--dry-run=client -o yaml \
|
||||
> skaffold/sensitive/ngcapi.yaml
|
||||
```
|
||||
|
||||
### Deploy the service
|
||||
|
||||
Run the following to deploy the retriever-ingest to your cluster.
|
||||
|
||||
```shell
|
||||
skaffold dev \
|
||||
-v info \
|
||||
-f ./skaffold/nv-ingest.skaffold.yaml \
|
||||
--kube-context "kind-nv-ingest-${USER}"
|
||||
```
|
||||
|
||||
<details><summary>explanation of those flags (click me)</summary>
|
||||
|
||||
- `-v info` = print INFO-level and above logs from `skaffold` and the tools it calls (like `helm` or `kustomize`)
|
||||
- `-f ./skaffold/nv-ingest.skaffold.yaml` = use configuration specific to retriever-ingest
|
||||
- `--tail=false` = don't flood your console with all the logs from the deployed containers
|
||||
- `--kube-context "kind-nv-ingest-${USER}"` = target the specific Kubernetes cluster you created with `kind` above
|
||||
|
||||
</details>
|
||||
|
||||
`skaffold dev` watches your local files and automatically redeploys the app as you change those files.
|
||||
It also holds control in the terminal you run it in, and handles shutting down the pods in Kubernetes when you `Ctrl + C` out of it.
|
||||
|
||||
You should see output similar to this:
|
||||
|
||||
```shell
|
||||
Generating tags...
|
||||
- ...
|
||||
Checking cache...
|
||||
- ...
|
||||
Tags used in deployment:
|
||||
- ...
|
||||
Starting deploy...
|
||||
Loading images into kind cluster nodes...
|
||||
- ...
|
||||
Waiting for deployments to stabilize...
|
||||
Deployments stabilized in 23.08 seconds
|
||||
Watching for changes...
|
||||
```
|
||||
|
||||
When you run this command, `skaffold dev` finds a random open port on the system and exposes the retriever-ingest service on that port ([skaffold docs](https://skaffold.dev/docs/port-forwarding/)).
|
||||
|
||||
You can find that port in `skaffold`'s logs, in a statement like this:
|
||||
|
||||
```bash
|
||||
Port forwarding Service/nv-ingest in namespace , remote port http -> http://0.0.0.0:4503
|
||||
```
|
||||
|
||||
Alternatively, you can obtain it like this:
|
||||
|
||||
```shell
|
||||
NV_INGEST_MS_PORT=$(
|
||||
ps aux \
|
||||
| grep -E "kind\-${USER} port-forward .*Service/nv-ingest" \
|
||||
| grep -o -E '[0-9]+:http' \
|
||||
| cut -d ':' -f1
|
||||
)
|
||||
```
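
Alternatively, assuming the service is named `nv-ingest` as in the log line above, you can skip the `ps` parsing and open your own forward to a fixed local port (then use that port in place of `NV_INGEST_MS_PORT` below):

```shell
kubectl port-forward \
  --context "kind-nv-ingest-${USER}" \
  service/nv-ingest 8080:http
```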
|
||||
|
||||
To confirm that the service is deployed and working, issue a request against the port you set up port-forwarding to above.
|
||||
|
||||
```shell
|
||||
API_HOST="http://localhost:${NV_INGEST_MS_PORT}"
|
||||
|
||||
curl \
|
||||
-i \
|
||||
-X GET \
|
||||
"${API_HOST}/health"
|
||||
```
|
||||
|
||||
Additionally, running `skaffold verify` in a new terminal will run verification tests against the service (i.e. [integration tests](https://skaffold.dev/docs/verify/)). These are very lightweight health checks, and should not be confused with actual integration tests.
|
||||
|
||||
## Clean Up
|
||||
|
||||
To destroy the entire Kubernetes cluster, run the following.
|
||||
|
||||
```shell
|
||||
kind delete cluster \
|
||||
--name "nv-ingest-${USER}"
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Debugging Tools
|
||||
|
||||
`kubectl` is the official CLI for Kubernetes, and supports a lot of useful functionality.
|
||||
|
||||
For example, to get a shell inside the `nv-ingest-ms-runtime` container in your deployment, run the following:
|
||||
|
||||
```shell
|
||||
NV_INGEST_POD=$(
|
||||
kubectl get pods \
|
||||
--context "kind-${USER}" \
|
||||
--namespace default \
|
||||
-l 'app.kubernetes.io/instance=nv-ingest-ms-runtime' \
|
||||
--no-headers \
|
||||
| awk '{print $1}'
|
||||
)
|
||||
kubectl exec \
|
||||
--context "kind-${USER}" \
|
||||
--namespace default \
|
||||
pod/${NV_INGEST_POD} \
|
||||
-i \
|
||||
-t \
|
||||
-- sh
|
||||
```
|
||||
|
||||
For an interactive, live-updating experience, try `k9s`.
|
||||
To launch it, just run `k9s`.
|
||||
|
||||
```shell
|
||||
k9s
|
||||
```
|
||||
|
||||
You should see something like the following.
|
||||
|
||||
{width=80%}
|
||||
|
||||
For details on how to use it, see https://k9scli.io/topics/commands/.
|
||||
|
||||
### Installing Helm Repositories
|
||||
|
||||
You may encounter an error like this:
|
||||
|
||||
> _Error: no repository definition for https://helm.dask.org. Please add the missing repos via 'helm repo add'_
|
||||
|
||||
That indicates that your local installation of `Helm` (sort of a package manager for Kubernetes configurations) doesn't know
|
||||
how to access a remote repository containing Kubernetes configurations.
|
||||
|
||||
As that error message says, run `helm repo add` with that URL and an informative name.
|
||||
|
||||
```shell
|
||||
helm repo add \
|
||||
bitnami \
|
||||
https://charts.bitnami.com/bitnami
|
||||
|
||||
helm repo add \
|
||||
tika \
|
||||
https://apache.jfrog.io/artifactory/tika
|
||||
```
|
||||
|
||||
### Getting more logs from `skaffold`
|
||||
|
||||
You may encounter an error like this:
|
||||
|
||||
```shell
|
||||
Generating tags...
|
||||
- retrieval-ms -> retrieval-ms:f181a78-dirty
|
||||
Checking cache...
|
||||
- retrieval-ms: Found Locally
|
||||
Cleaning up...
|
||||
- No resources found
|
||||
building helm dependencies: exit status 1
|
||||
```
|
||||
|
||||
Seeing only "building helm dependencies" likely means you ran `skaffold dev` or `skaffold run` in a fairly quiet mode.
|
||||
|
||||
Re-run those commands with something like `-v info` or `-v debug` to get more information about what specifically failed.
|
||||
|
||||
## References
|
||||
|
||||
- Helm quickstart: https://helm.sh/docs/intro/quickstart/
|
||||
- `kind` docs: https://kind.sigs.k8s.io/
|
||||
- `skaffold` docs: https://skaffold.dev/docs/
|
||||
247 docs/nv-ingest_cli.md (new file)
@@ -0,0 +1,247 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
After installing the Python dependencies, you'll be able to use the nv-ingest-cli tool.
|
||||
|
||||
```bash
|
||||
nv-ingest-cli --help
|
||||
Usage: nv-ingest-cli [OPTIONS]
|
||||
|
||||
Options:
|
||||
--batch_size INTEGER Batch size (must be >= 1). [default: 10]
|
||||
--doc PATH Add a new document to be processed (supports
|
||||
multiple).
|
||||
--dataset PATH Path to a dataset definition file.
|
||||
--client [REST|REDIS|KAFKA] Client type. [default: REDIS]
|
||||
--client_host TEXT DNS name or URL for the endpoint.
|
||||
--client_port INTEGER Port for the client endpoint.
|
||||
--client_kwargs TEXT Additional arguments to pass to the client.
|
||||
--concurrency_n INTEGER Number of inflight jobs to maintain at one
|
||||
time. [default: 10]
|
||||
--document_processing_timeout INTEGER
|
||||
Timeout when waiting for a document to be
|
||||
processed. [default: 10]
|
||||
--dry_run Perform a dry run without executing actions.
|
||||
--output_directory PATH Output directory for results.
|
||||
--log_level [DEBUG|INFO|WARNING|ERROR|CRITICAL]
|
||||
Log level. [default: INFO]
|
||||
--shuffle_dataset Shuffle the dataset before processing.
|
||||
[default: True]
|
||||
--task TEXT Task definitions in JSON format, allowing multiple tasks to be configured by repeating this option.
|
||||
Each task must be specified with its type and corresponding options in the '[task_id]:{json_options}' format.
|
||||
|
||||
Example:
|
||||
--task 'split:{"split_by":"page", "split_length":10}'
|
||||
--task 'extract:{"document_type":"pdf", "extract_text":true}'
|
||||
--task 'extract:{"document_type":"pdf", "extract_method":"eclair"}'
|
||||
--task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
|
||||
--task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
|
||||
--task 'store:{"content_type":"image", "store_method":"minio", "endpoint":"minio:9000"}'
|
||||
--task 'store:{"content_type":"image", "store_method":"minio", "endpoint":"minio:9000", "text_depth": "page"}'
|
||||
--task 'caption:{}'
|
||||
|
||||
Tasks and Options:
|
||||
- split: Divides documents according to specified criteria.
|
||||
Options:
|
||||
- split_by (str): Criteria ('page', 'size', 'word', 'sentence'). No default.
|
||||
- split_length (int): Segment length. No default.
|
||||
- split_overlap (int): Segment overlap. No default.
|
||||
- max_character_length (int): Maximum segment character count. No default.
|
||||
- sentence_window_size (int): Sentence window size. No default.
|
||||
|
||||
- extract: Extracts content from documents, customizable per document type.
|
||||
Can be specified multiple times for different 'document_type' values.
|
||||
Options:
|
||||
- document_type (str): Document format ('pdf', 'docx', 'pptx', 'html', 'xml', 'excel', 'csv', 'parquet'). Required.
|
||||
- text_depth (str): Depth at which text parsing occurs ('document', 'page'), additional text_depths are partially supported and depend on the specified extraction method ('block', 'line', 'span')
|
||||
- extract_method (str): Extraction technique. Defaults are smartly chosen based on 'document_type'.
|
||||
- extract_text (bool): Enables text extraction. Default: False.
|
||||
- extract_images (bool): Enables image extraction. Default: False.
|
||||
- extract_tables (bool): Enables table extraction. Default: False.
|
||||
|
||||
- store: Stores any images extracted from documents.
|
||||
Options:
|
||||
- structured (bool): Flag to write extracted charts and tables to object store. Default: True.
|
||||
- images (bool): Flag to write extracted images to object store. Default: False.
|
||||
- store_method (str): Storage type ('minio', ). Required.
|
||||
|
||||
- caption: Attempts to extract captions for images extracted from documents. Note: this is not generative, but rather a
|
||||
simple extraction.
|
||||
Options:
|
||||
N/A
|
||||
|
||||
- dedup: Identifies and optionally filters duplicate images in extraction.
|
||||
Options:
|
||||
- content_type (str): Content type to deduplicate ('image')
|
||||
- filter (bool): When set to True, duplicates will be filtered, otherwise, an info message will be added.
|
||||
|
||||
- filter: Identifies and optionally filters images above or below scale thresholds.
|
||||
Options:
|
||||
- content_type (str): Content type to deduplicate ('image')
|
||||
- min_size: (Union[float, int]): Minimum allowable size of extracted image.
|
||||
- max_aspect_ratio: (Union[float, int]): Maximum allowable aspect ratio of extracted image.
|
||||
- min_aspect_ratio: (Union[float, int]): Minimum allowable aspect ratio of extracted image.
|
||||
- filter (bool): When set to True, duplicates will be filtered, otherwise, an info message will be added.
|
||||
|
||||
Note: The 'extract_method' automatically selects the optimal method based on 'document_type' if not explicitly stated.
|
||||
--version Show version.
|
||||
--help Show this message and exit.
|
||||
|
||||
```
|
||||
|
||||
### Example document submission to the nv-ingest-ms-runtime service
|
||||
|
||||
Each of the following can be run from the host machine or from within the nv-ingest-ms-runtime container.
|
||||
|
||||
- Host: `nv-ingest-cli ...`
|
||||
- Container: `nv-ingest-cli ...`
|
||||
|
||||
Submit a text file, with no splitting.
|
||||
|
||||
**Note:** You will receive a response containing a single document, which is the entire text file -- This is mostly
|
||||
a NO-OP, but the returned data will be wrapped in the appropriate metadata structure.
|
||||
|
||||
```bash
|
||||
nv-ingest-cli \
|
||||
--doc ./data/test.pdf \
|
||||
--client_host=localhost \
|
||||
--client_port=6379
|
||||
```
|
||||
|
||||
Submit a PDF file with only a splitting task.
|
||||
|
||||
```bash
|
||||
nv-ingest-cli \
|
||||
--doc ./data/test.pdf \
|
||||
--output_directory ./processed_docs \
|
||||
--task='split' \
|
||||
--client_host=localhost \
|
||||
--client_port=6379
|
||||
```
|
||||
|
||||
Submit a PDF file with splitting and extraction tasks.
|
||||
|
||||
**Note: (TODO)** This currently only works for pdfium, eclair, and Unstructured.io; haystack, Adobe, and LlamaParse
|
||||
have existing workflows but have not been fully converted to use our unified metadata schema.
|
||||
|
||||
```bash
|
||||
nv-ingest-cli \
|
||||
--doc ./data/test.pdf \
|
||||
--output_directory ./processed_docs \
|
||||
--task='extract:{"document_type": "pdf", "extract_method": "pdfium"}' \
|
||||
--task='extract:{"document_type": "docx", "extract_method": "python_docx"}' \
|
||||
--task='split' \
|
||||
--client_host=localhost \
|
||||
--client_port=6379
|
||||
|
||||
```
|
||||
|
||||
Submit a [dataset](#command-line-dataset-creation-with-enumeration-and-sampling) for processing
|
||||
|
||||
```shell
|
||||
nv-ingest-cli \
|
||||
--dataset dataset.json \
|
||||
--output_directory ./processed_docs \
|
||||
--task='extract:{"document_type": "pdf", "extract_method": "pdfium"}' \
|
||||
--client_host=localhost \
|
||||
--client_port=6379
|
||||
|
||||
```
|
||||
|
||||
Submit a PDF file with extraction tasks and upload extracted images to MinIO.
|
||||
|
||||
```bash
|
||||
nv-ingest-cli \
|
||||
--doc ./data/test.pdf \
|
||||
--output_directory ./processed_docs \
|
||||
--task='extract:{"document_type": "pdf", "extract_method": "pdfium"}' \
|
||||
--task='store:{"endpoint":"minio:9000","access_key":"minioadmin","secret_key":"minioadmin"}' \
|
||||
--client_host=localhost \
|
||||
--client_port=6379
|
||||
|
||||
```
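
Tasks such as `dedup` and `filter` can be chained after an extraction in the same way; the option values below are illustrative:

```bash
nv-ingest-cli \
  --doc ./data/test.pdf \
  --output_directory ./processed_docs \
  --task='extract:{"document_type": "pdf", "extract_method": "pdfium", "extract_images": true}' \
  --task='dedup:{"content_type": "image", "filter": true}' \
  --task='filter:{"content_type": "image", "min_size": 128, "filter": true}' \
  --client_host=localhost \
  --client_port=6379
```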
|
||||
|
||||
### Command line dataset creation with enumeration and sampling
|
||||
|
||||
#### gen_dataset.py
|
||||
|
||||
```shell
|
||||
python ./src/util/gen_dataset.py --source_directory=./data --size=1GB --sample pdf=60 --sample txt=40 --output_file \
|
||||
dataset.json --validate-output
|
||||
```
|
||||
|
||||
This script samples files from a specified source directory according to defined proportions and a total size target. It
|
||||
offers options for caching the file list, outputting a sampled file list, and validating the output.
|
||||
|
||||
### Options
|
||||
|
||||
- `--source_directory`: Specifies the path to the source directory where files will be scanned for sampling.
|
||||
|
||||
- **Type**: String
|
||||
- **Required**: Yes
|
||||
- **Example**: `--source_directory ./data`
|
||||
|
||||
- `--size`: Defines the total size of files to sample. You can use suffixes (KB, MB, GB).
|
||||
|
||||
- **Type**: String
|
||||
- **Required**: Yes
|
||||
- **Example**: `--size 500MB`
|
||||
|
||||
- `--sample`: Specifies file types and their proportions of the total size. Can be used multiple times for different
|
||||
file types.
|
||||
|
||||
- **Type**: String
|
||||
- **Required**: No
|
||||
- **Multiple**: Yes
|
||||
- **Example**: `--sample pdf=40 --sample txt=60`
|
||||
|
||||
- `--cache_file`: If provided, caches the scanned file list as JSON at this path.
|
||||
|
||||
- **Type**: String
|
||||
- **Required**: No
|
||||
- **Example**: `--cache_file ./file_list_cache.json`
|
||||
|
||||
- `--output_file`: If provided, outputs the list of sampled files as JSON at this path.
|
||||
|
||||
- **Type**: String
|
||||
- **Required**: No
|
||||
- **Example**: `--output_file ./sampled_files.json`
|
||||
|
||||
- `--validate-output`: If set, the script re-validates the `output_file` JSON and logs total bytes for each file type.
|
||||
|
||||
- **Type**: Flag
|
||||
- **Required**: No
|
||||
|
||||
- `--log-level`: Sets the logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'). Default is 'INFO'.
|
||||
|
||||
- **Type**: Choice
|
||||
- **Required**: No
|
||||
- **Example**: `--log-level DEBUG`
|
||||
|
||||
- `--with-replacement`: Sample with replacement. Files can be selected multiple times.
|
||||
- **Type**: Flag
|
||||
- **Default**: True (if omitted, sampling will be with replacement)
|
||||
- **Usage Example**: `--with-replacement` to enable sampling with replacement or omit for default behavior.
|
||||
Use `--no-with-replacement` to disable it and sample without replacement.
|
||||
|
||||
The script performs a sampling process that respects the specified size and type proportions, generates a detailed file
|
||||
list, and provides options for caching and validation to facilitate efficient data handling and integrity checking.
|
||||
|
||||
### Command line interface for the Image Viewer application

Displays paginated images from a JSON file in a simple viewer. Each image is resized for uniform display, and users can navigate through the images using "Next" and "Previous" buttons.
|
||||
|
||||
#### image_viewer.py
|
||||
|
||||
- `--file_path`: Specifies the path to the JSON file containing the images. The JSON file should contain a list of
|
||||
objects, each with an `"image"` field that includes a base64 encoded string of the image data.
|
||||
- **Type**: String
|
||||
- **Required**: Yes
|
||||
- **Example Usage**:
|
||||
```
|
||||
--file_path "/path/to/your/images.json"
|
||||
```
|
||||
33 docs/telemetry.md (new file)
@@ -0,0 +1,33 @@
|
||||
<!--
|
||||
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
-->
|
||||
|
||||
# Telemetry
|
||||
|
||||
## Docker compose
|
||||
|
||||
To run OpenTelemetry locally, run
|
||||
|
||||
```shell
|
||||
$ docker compose up otel-collector
|
||||
```
|
||||
|
||||
Once OpenTelemetry and Zipkin are running, you can open your browser to explore traces: [http://localhost:9411/zipkin/](http://localhost:9411/zipkin/).
|
||||
|
||||

|
||||
|
||||
To run Prometheus, run
|
||||
|
||||
```shell
|
||||
$ docker compose up prometheus
|
||||
```
|
||||
|
||||
Once Prometheus is running, you can open your browser to explore metrics: [http://localhost:9090/](http://localhost:9090/).
|
||||
|
||||

|
||||
|
||||
## Helm chart
|
||||
|
||||
TODO
|
||||
23 helm/.helmignore (new file)
@@ -0,0 +1,23 @@
|
||||
# Patterns to ignore when building packages.
|
||||
# This supports shell glob matching, relative path matching, and
|
||||
# negation (prefixed with !). Only one pattern per line.
|
||||
.DS_Store
|
||||
# Common VCS dirs
|
||||
.git/
|
||||
.gitignore
|
||||
.bzr/
|
||||
.bzrignore
|
||||
.hg/
|
||||
.hgignore
|
||||
.svn/
|
||||
# Common backup files
|
||||
*.swp
|
||||
*.bak
|
||||
*.tmp
|
||||
*.orig
|
||||
*~
|
||||
# Various IDEs
|
||||
.project
|
||||
.idea/
|
||||
*.tmproj
|
||||
.vscode/
|
||||
10 helm/CHANGELOG.md (new file)
@@ -0,0 +1,10 @@
|
||||
|
||||
# CHANGELOG
|
||||
|
||||
## v0.2.24 - 15 Aug 2024
|
||||
|
||||
Update the handling of the default for the persistent volume name
|
||||
|
||||
## v0.2.23 - 15 Aug 2024
|
||||
|
||||
Update for otel env vars
|
||||
44 helm/Chart.yaml (new file)
@@ -0,0 +1,44 @@
|
||||
apiVersion: v2
|
||||
name: nv-ingest
|
||||
description: NV-Ingest Microservice
|
||||
type: application
|
||||
version: 0.3.4
|
||||
maintainers:
|
||||
- name: NVIDIA Corporation
|
||||
url: https://www.nvidia.com/
|
||||
dependencies:
|
||||
- name: common
|
||||
repository: oci://registry-1.docker.io/bitnamicharts
|
||||
tags:
|
||||
- bitnami-common
|
||||
version: 2.x.x
|
||||
- name: redis
|
||||
repository: oci://registry-1.docker.io/bitnamicharts
|
||||
tags:
|
||||
- redis
|
||||
version: 19.1.3
|
||||
condition: redisDeployed
|
||||
- name: zipkin
|
||||
repository: https://zipkin.io/zipkin-helm
|
||||
version: 0.1.2
|
||||
condition: zipkinDeployed
|
||||
- name: opentelemetry-collector
|
||||
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
|
||||
version: 0.78.1
|
||||
condition: otelDeployed
|
||||
- name: yolox-nim
|
||||
repository: "alias:ngc"
|
||||
version: 0.1.5
|
||||
condition: yoloxDeployed
|
||||
- name: cached-nim
|
||||
repository: "alias:ngc"
|
||||
version: 0.1.5
|
||||
condition: cachedDeployed
|
||||
- name: paddleocr-nim
|
||||
repository: "alias:ngc"
|
||||
version: 0.1.5
|
||||
condition: paddleocrDeployed
|
||||
- name: deplot-nim
|
||||
repository: "alias:ngc"
|
||||
version: 0.1.5
|
||||
condition: deplotDeployed
|
||||
3 helm/LICENSE (new file)
@@ -0,0 +1,3 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
238 helm/README.md (new file)
@@ -0,0 +1,238 @@
|
||||
# NVIDIA NVIngest
|
||||
|
||||
## Setup Environment
|
||||
|
||||
- First create your namespace
|
||||
|
||||
```bash
|
||||
NAMESPACE=nv-ingest
|
||||
kubectl create namespace ${NAMESPACE}
|
||||
```
|
||||
|
||||
- Install the chart
|
||||
|
||||
```bash
|
||||
helm upgrade \
|
||||
--install \
|
||||
--username '$oauthtoken' \
|
||||
--password "${NGC_API_KEY}" \
|
||||
-n ${NAMESPACE} \
|
||||
nv-ingest \
|
||||
--set imagePullSecret.create=true \
|
||||
--set imagePullSecret.password="${NGC_API_KEY}" \
|
||||
--set ngcSecret.create=true \
|
||||
--set ngcSecret.password="${NGC_API_KEY}" \
|
||||
--set image.repository="#placeholder" \
|
||||
--set image.tag="24.08-rc2" \
|
||||
#placeholder
|
||||
|
||||
```
|
||||
|
||||
Optionally you can create your own versions of the `Secrets` if you do not want to use the creation via the helm chart.
|
||||
|
||||
|
||||
```bash
|
||||
|
||||
NAMESPACE=nvidia-nims
|
||||
DOCKER_CONFIG='{"auths":{"nvcr.io":{"username":"$oauthtoken", "password":"'${NGC_API_KEY}'" }}}'
|
||||
echo -n $DOCKER_CONFIG | base64 -w0
|
||||
NGC_REGISTRY_PASSWORD=$(echo -n $DOCKER_CONFIG | base64 -w0 )
|
||||
|
||||
kubectl apply -n ${NAMESPACE} -f - <<EOF
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: nvcrimagepullsecret
|
||||
type: kubernetes.io/dockerconfigjson
|
||||
data:
|
||||
.dockerconfigjson: ${NGC_REGISTRY_PASSWORD}
|
||||
EOF
|
||||
kubectl create -n ${NAMESPACE} secret generic ngc-api --from-literal=NGC_API_KEY=${NGC_API_KEY}
|
||||
|
||||
```
|
||||
|
||||
You can also use an External Secret Store like Vault, the name of the secret name expected for the NGC API is `ngc-api` and the secret name expected for NVCR is `nvcrimagepullsecret`.
|
||||
|
||||
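For example, with the External Secrets Operator you might sync the NGC API key from your store into the expected secret name. This is a minimal sketch, assuming the operator is installed, a `ClusterSecretStore` named `vault-backend` exists, and the remote key path is illustrative; the `nvcrimagepullsecret` Secret (type `kubernetes.io/dockerconfigjson`) can be synced the same way.

```yaml
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: ngc-api
spec:
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: vault-backend          # assumed store name
  target:
    name: ngc-api                # secret name the chart expects
  data:
    - secretKey: NGC_API_KEY
      remoteRef:
        key: nv-ingest/ngc       # illustrative path in your secret store
        property: api_key
```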
## Parameters

### Deployment parameters

| Name | Description | Value |
| ---- | ----------- | ----- |
| `affinity` | [default: {}] Affinity settings for deployment. | `{}` |
| `nodeSelector` | Sets node selectors for the NIM -- for example `nvidia.com/gpu.present: "true"` | `{}` |
| `logLevel` | Log level of the NVIngest service. Possible values are TRACE, DEBUG, INFO, WARNING, ERROR, CRITICAL. | `DEBUG` |
| `extraEnvVarsCM` | [default: ""] A ConfigMap holding environment variables to include in the NVIngest container | `""` |
| `extraEnvVarsSecret` | [default: ""] A Kubernetes Secret mapped to environment variables to include in the NVIngest container | `""` |
| `fullnameOverride` | [default: ""] A name to force the fullname of the NVIngest container to have; defaults to the Helm release name | `""` |
| `nameOverride` | [default: ""] A name to use as the base for objects created by this Helm chart | `""` |
| `image.repository` | NIM image repository | `""` |
| `image.tag` | Image tag or version | `""` |
| `image.pullPolicy` | Image pull policy | `""` |
| `podAnnotations` | Sets additional annotations on the main deployment pods | `{}` |
| `podLabels` | Specify extra labels to be added to deployed pods. | `{}` |
| `podSecurityContext` | Specify privilege and access control settings for the pod | |
| `podSecurityContext.fsGroup` | Specify file system owner group ID. | `1000` |
| `extraVolumes` | Adds arbitrary additional volumes to the deployment definition | `{}` |
| `extraVolumeMounts` | Specify volume mounts to the main container from `extraVolumes` | `{}` |
| `imagePullSecrets` | Specify list of secret names that are needed for the main container and any init containers. | |
| `containerSecurityContext` | Sets privilege and access control settings for the container (affects only the main container, not pod-level) | `{}` |
| `tolerations` | Specify tolerations for pod assignment. Allows the scheduler to schedule pods with matching taints. | |
| `replicaCount` | The number of replicas for NVIngest when autoscaling is disabled | `1` |
| `resources.limits."nvidia.com/gpu"` | Specify number of GPUs to present to the running service. | |
| `resources.limits.memory` | Specify limit for memory | `32Gi` |
| `resources.requests.memory` | Specify request for memory | `16Gi` |
| `tmpDirSize` | Specify the amount of space to reserve for temporary storage | `8Gi` |
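As an illustration, several of these parameters can be collected in a values file and passed to `helm upgrade --install` with `-f`; the overrides below are example values, not chart defaults.

```yaml
# custom-values.yaml -- illustrative overrides only
logLevel: INFO
nodeSelector:
  nvidia.com/gpu.present: "true"
replicaCount: 1
resources:
  limits:
    nvidia.com/gpu: 1
    memory: 32Gi
  requests:
    memory: 16Gi
tmpDirSize: 8Gi
```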
### NIM Configuration

Define additional values for the dependent NIM helm charts by updating the "yolox-nim", "cached-nim", "deplot-nim", and "paddleocr-nim"
values. A sane set of configurations is already included in this values file, and only the "image.repository" and "image.tag" fields are
explicitly called out here.

| Name | Description | Value |
| ---- | ----------- | ----- |
| `yolox-nim.image.repository` | The repository to override the location of the YOLOX | |
| `yolox-nim.image.tag` | The tag override for YOLOX | |
| `cached-nim.image.repository` | The repository to override the location of the Cached Model NIM | |
| `cached-nim.image.tag` | The tag override for Cached Model NIM | |
| `paddleocr-nim.image.repository` | The repository to override the location of the Paddle OCR NIM | |
| `paddleocr-nim.image.tag` | The tag override for Paddle OCR NIM | |
| `deplot-nim.image.repository` | The repository to override the location of the Deplot NIM | |
| `deplot-nim.image.tag` | The tag override for Deplot NIM | |
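For example, pointing one of the dependent NIM charts at a different image could look like the following; the repository and tag are placeholders, not values shipped with the chart.

```yaml
yolox-nim:
  image:
    repository: nvcr.io/your-org/yolox   # placeholder repository
    tag: "1.0.0"                         # placeholder tag
```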
### Autoscaling parameters

Values used for creating a `HorizontalPodAutoscaler`. If autoscaling is not enabled, the remaining values are ignored.
NVIDIA recommends usage of the custom metrics API, commonly implemented with the prometheus-adapter.
Standard metrics of CPU and memory are of limited use in scaling NIM.

| Name | Description | Value |
| ---- | ----------- | ----- |
| `autoscaling.enabled` | Enables horizontal pod autoscaler. | `false` |
| `autoscaling.minReplicas` | Specify minimum replicas for autoscaling. | `1` |
| `autoscaling.maxReplicas` | Specify maximum replicas for autoscaling. | `100` |
| `autoscaling.metrics` | Array of metrics for autoscaling. | `[]` |
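A sketch of enabling the HPA with a Pods-type custom metric served through the prometheus-adapter; the metric name is illustrative and must actually exist in your custom metrics API.

```yaml
autoscaling:
  enabled: true
  minReplicas: 1
  maxReplicas: 4
  metrics:
    - type: Pods
      pods:
        metric:
          name: nv_ingest_queue_depth    # illustrative custom metric
        target:
          type: AverageValue
          averageValue: "10"
```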
### Redis configurations

Include any Redis configuration that you'd like with the deployed Redis.
Find values at https://github.com/bitnami/charts/tree/main/bitnami/redis

| Name | Description | Value |
| ---- | ----------- | ----- |
| `redisDeployed` | Whether to deploy Redis from this helm chart | `true` |
| `redis` | Find values at https://github.com/bitnami/charts/tree/main/bitnami/redis | `sane {}` |

### Environment Variables

Define environment variables as key/value dictionary pairs

| Name | Description | Value |
| ---- | ----------- | ----- |
| `envVars` | Adds arbitrary environment variables to the main container using key-value pairs, for example NAME: value | `sane {}` |
| `envVars.MESSAGE_CLIENT_HOST` | Override this value if disabling Redis deployment in this chart. | `"nv-ingest-redis-master"` |
| `envVars.MESSAGE_CLIENT_PORT` | Override this value if disabling Redis deployment in this chart. | `"6379"` |
| `envVars.NV_INGEST_DEFAULT_TIMEOUT_MS` | Override the timeout of the NVIngest requests. | `"1234"` |
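For instance, to use an already-running Redis instead of the bundled subchart, you might disable the deployment and point the client at your endpoint; the hostname below is an example only.

```yaml
redisDeployed: false
envVars:
  MESSAGE_CLIENT_HOST: "my-existing-redis-master"   # example external Redis host
  MESSAGE_CLIENT_PORT: "6379"
```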
### Open Telemetry

Define environment variables as key/value dictionary pairs for configuring OTEL deployments.
A sane set of parameters is set for the deployed version of OpenTelemetry with this Helm chart.
Override any values of the OpenTelemetry helm chart by overriding the `opentelemetry-collector` value.

| Name | Description | Value |
| ---- | ----------- | ----- |
| `otelEnabled` | Whether to enable OTEL collection | `false` |
| `otelDeployed` | Whether to deploy OTEL from this helm chart | `false` |
| `otelEnvVars` | Adds arbitrary environment variables for configuring OTEL using key-value pairs, for example NAME: value | `sane {}` |
| `otelEnvVars.OTEL_EXPORTER_OTLP_ENDPOINT` | | `"http://$(HOST_IP):4317" # sends to gRPC receiver on port 4317` |
| `otelEnvVars.OTEL_SERVICE_NAME` | | `"nemo-retrieval-service"` |
| `otelEnvVars.OTEL_TRACES_EXPORTER` | | `"otlp"` |
| `otelEnvVars.OTEL_METRICS_EXPORTER` | | `"otlp"` |
| `otelEnvVars.OTEL_LOGS_EXPORTER` | | `"none"` |
| `otelEnvVars.OTEL_PROPAGATORS` | | `"tracecontext baggage"` |
| `otelEnvVars.OTEL_RESOURCE_ATTRIBUTES` | | `"deployment.environment=$(NAMESPACE)"` |
| `otelEnvVars.OTEL_PYTHON_EXCLUDED_URLS` | | `"health"` |
| `opentelemetry-collector` | Configures the opentelemetry helm chart - see https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/values.yaml | |
| `zipkinDeployed` | Whether to deploy Zipkin with OpenTelemetry from this helm chart | `false` |
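A minimal sketch of turning tracing on with the bundled collector and Zipkin, leaving the exporter endpoint wiring to the chart defaults described above.

```yaml
otelEnabled: true
otelDeployed: true
zipkinDeployed: true
otelEnvVars:
  OTEL_SERVICE_NAME: "nemo-retrieval-service"
  OTEL_TRACES_EXPORTER: "otlp"
  OTEL_METRICS_EXPORTER: "otlp"
  OTEL_LOGS_EXPORTER: "none"
```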
### Ingress parameters

| Name | Description | Value |
| ---- | ----------- | ----- |
| `ingress.enabled` | Enables ingress. | `false` |
| `ingress.className` | Specify class name for Ingress. | `""` |
| `ingress.annotations` | Specify additional annotations for ingress. | `{}` |
| `ingress.hosts` | Specify list of hosts each containing lists of paths. | |
| `ingress.hosts[0].host` | Specify name of host. | `chart-example.local` |
| `ingress.hosts[0].paths[0].path` | Specify ingress path. | `/` |
| `ingress.hosts[0].paths[0].pathType` | Specify path type. | `ImplementationSpecific` |
| `ingress.tls` | Specify list of pairs of TLS `secretName` and hosts. | `[]` |
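For example, exposing the service through an NGINX ingress controller might look like the following; the class name and host are placeholders for your environment.

```yaml
ingress:
  enabled: true
  className: "nginx"               # placeholder ingress class
  hosts:
    - host: nv-ingest.example.com  # placeholder host
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls: []
```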
### Liveness probe parameters

| Name | Description | Value |
| ---- | ----------- | ----- |
| `livenessProbe.enabled` | Enables `livenessProbe` | `false` |
| `livenessProbe.httpGet.path` | `livenessProbe` endpoint path | `/health` |
| `livenessProbe.httpGet.port` | `livenessProbe` endpoint port | `http` |
| `livenessProbe.initialDelaySeconds` | Initial delay seconds for `livenessProbe` | `120` |
| `livenessProbe.timeoutSeconds` | Timeout seconds for `livenessProbe` | `20` |
| `livenessProbe.periodSeconds` | Period seconds for `livenessProbe` | `10` |
| `livenessProbe.successThreshold` | Success threshold for `livenessProbe` | `1` |
| `livenessProbe.failureThreshold` | Failure threshold for `livenessProbe` | `20` |

### Startup probe parameters

| Name | Description | Value |
| ---- | ----------- | ----- |
| `startupProbe.enabled` | Enables `startupProbe` | `false` |
| `startupProbe.httpGet.path` | `startupProbe` endpoint path | `/health` |
| `startupProbe.httpGet.port` | `startupProbe` endpoint port | `http` |
| `startupProbe.initialDelaySeconds` | Initial delay seconds for `startupProbe` | `120` |
| `startupProbe.timeoutSeconds` | Timeout seconds for `startupProbe` | `10` |
| `startupProbe.periodSeconds` | Period seconds for `startupProbe` | `30` |
| `startupProbe.successThreshold` | Success threshold for `startupProbe` | `1` |
| `startupProbe.failureThreshold` | Failure threshold for `startupProbe` | `220` |

### Readiness probe parameters

| Name | Description | Value |
| ---- | ----------- | ----- |
| `readinessProbe.enabled` | Enables `readinessProbe` | `false` |
| `readinessProbe.httpGet.path` | `readinessProbe` endpoint path | `/health` |
| `readinessProbe.httpGet.port` | `readinessProbe` endpoint port | `http` |
| `readinessProbe.initialDelaySeconds` | Initial delay seconds for `readinessProbe` | `120` |
| `readinessProbe.timeoutSeconds` | Timeout seconds for `readinessProbe` | `10` |
| `readinessProbe.periodSeconds` | Period seconds for `readinessProbe` | `30` |
| `readinessProbe.successThreshold` | Success threshold for `readinessProbe` | `1` |
| `readinessProbe.failureThreshold` | Failure threshold for `readinessProbe` | `220` |
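A sketch of enabling all three probes against the `/health` endpoint, overriding only the liveness timings and keeping the listed defaults elsewhere.

```yaml
livenessProbe:
  enabled: true
  httpGet:
    path: /health
    port: http
  initialDelaySeconds: 120
  timeoutSeconds: 20
  periodSeconds: 10
readinessProbe:
  enabled: true
startupProbe:
  enabled: true
```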
### Service parameters

| Name | Description | Value |
| ---- | ----------- | ----- |
| `service.type` | Specifies the service type for the deployment. | `ClusterIP` |
| `service.name` | Overrides the default service name | `""` |
| `service.port` | Specifies the HTTP port for the service. | `8000` |
| `service.nodePort` | Specifies an optional HTTP node port for the service. | `nil` |
| `service.annotations` | Specify additional annotations to be added to the service. | `{}` |
| `service.labels` | Specifies additional labels to be added to the service. | `{}` |
| `serviceAccount` | Options to specify the service account for the deployment. | |
| `serviceAccount.create` | Specifies whether a service account should be created. | `true` |
| `serviceAccount.annotations` | Sets annotations to be added to the service account. | `{}` |
| `serviceAccount.name` | Specifies the name of the service account to use. If it is not set and create is "true", a name is generated using a "fullname" template. | `""` |

### Secret Creation

Manage the creation of secrets used by the helm chart.

| Name | Description | Value |
| ---- | ----------- | ----- |
| `ngcSecret.create` | Specifies whether to create the NGC API secret | `false` |
| `ngcSecret.password` | The password to use for the NGC secret | `""` |
| `imagePullSecret.create` | Specifies whether to create the NVCR image pull secret | `false` |
| `imagePullSecret.password` | The password to use for the NVCR image pull secret | `""` |
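Equivalent to the `--set` flags in the install command above, secret creation can also be driven from a values file; the key below is a placeholder.

```yaml
ngcSecret:
  create: true
  password: "nvapi-..."        # placeholder NGC API key
imagePullSecret:
  create: true
  password: "nvapi-..."        # placeholder NGC API key
```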
68
helm/files/ngc_pull.sh
Executable file
@@ -0,0 +1,68 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

set -euo pipefail

# To ensure we actually have an NGC binary, switch to full path if default is used
if [ "$NGC_EXE" = "ngc" ]; then
    NGC_EXE=$(which ngc)
fi

# check if ngc cli is truly available at this point
if [ ! -x "$NGC_EXE" ]; then
    echo "ngc cli is not installed or available!"
    exit 1
fi

# download the model
directory="${STORE_MOUNT_PATH}/${NGC_MODEL_NAME}_v${NGC_MODEL_VERSION}"
echo "Directory is $directory"
ready_file="$directory/.ready"
lock_file="$directory/.lock"

mkdir -p "$directory"
exec 200>"$lock_file"
{
    # creating the symlink is atomic, so only one pod wins the download lock
    if ln -s "$lock_file" "$lock_file.locked"; then
        trap 'rm -f $lock_file.locked' EXIT
        if [ ! -e "$ready_file" ]; then
            $NGC_EXE registry model download-version --dest "$STORE_MOUNT_PATH" "${NGC_CLI_ORG}/${NGC_CLI_TEAM}/${NGC_MODEL_NAME}:${NGC_MODEL_VERSION}"
            # decrypt the model - if needed (conditions met)
            if [ -n "${NGC_DECRYPT_KEY:+''}" ] && [ -f "$directory/${MODEL_NAME}.enc" ]; then
                echo "Decrypting $directory/${MODEL_NAME}.enc"
                # untar if necessary
                if [ -n "${TARFILE:+''}" ]; then
                    echo "TARFILE enabled, unarchiving..."
                    openssl enc -aes-256-cbc -d -pbkdf2 -in "$directory/${MODEL_NAME}.enc" -out "$directory/${MODEL_NAME}.tar" -k "${NGC_DECRYPT_KEY}"
                    tar -xvf "$directory/${MODEL_NAME}.tar" -C "$STORE_MOUNT_PATH"
                    rm "$directory/${MODEL_NAME}.tar"
                else
                    openssl enc -aes-256-cbc -d -pbkdf2 -in "$directory/${MODEL_NAME}.enc" -out "$directory/${MODEL_NAME}" -k "${NGC_DECRYPT_KEY}"
                fi
                rm "$directory/${MODEL_NAME}.enc"
            else
                echo "No decryption key provided, or no encrypted file found. Skipping decryption."
                if [ -n "${TARFILE:+''}" ]; then
                    echo "TARFILE enabled, unarchiving..."
                    tar -xvf "$directory/${NGC_MODEL_VERSION}.tar.gz" -C "$STORE_MOUNT_PATH"
                    rm "$directory/${NGC_MODEL_VERSION}.tar.gz"
                fi
            fi
            touch "$ready_file"
            echo "Done downloading"
        else
            echo "Download was already complete"
        fi
        rm -f "$lock_file.locked"
    else
        # another pod holds the lock; wait until it marks the download as ready
        while [ ! -e "$ready_file" ]
        do
            echo "Did not get the download lock. Waiting for the pod holding the lock to download the files."
            sleep 1
        done
        echo "Done waiting"
    fi
}
ls -la "$directory"
17
helm/templates/NOTES.txt
Normal file
@@ -0,0 +1,17 @@
Installed {{ .Chart.Name }}-{{ .Chart.Version }}, named {{ .Release.Name }}.
Visit the application via:
{{- if .Values.ingress.enabled }}
{{- range $host := .Values.ingress.hosts }}
{{- range .paths }}
  http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
{{- end }}
{{- end }}
{{- else if and .Values.virtualService .Values.virtualService.enabled }}
  https://{{ .Values.virtualService.dnsName }}
{{- end }}

To learn more about the release, try:

  $ helm status {{ .Release.Name }}
  $ helm get {{ .Release.Name }}
  $ helm test {{ .Release.Name }}
396
helm/templates/_helpers.tpl
Normal file
@@ -0,0 +1,396 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "nv-ingest.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "nv-ingest.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "nv-ingest.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "nv-ingest.labels" -}}
helm.sh/chart: {{ include "nv-ingest.chart" . }}
{{ include "nv-ingest.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "nv-ingest.selectorLabels" -}}
app.kubernetes.io/name: {{ include "nv-ingest.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
*/}}
{{- define "nv-ingest.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "nv-ingest.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

{{/*
Create secret to access docker registry
*/}}
{{- define "nv-ingest.imagePullSecret" }}
{{- printf "{\"auths\": {\"%s\": {\"auth\": \"%s\"}}}" .Values.imagePullSecret.registry (printf "%s:%s" .Values.imagePullSecret.username .Values.imagePullSecret.password | b64enc) | b64enc }}
{{- end }}

{{/*
Return the NGC API key used by the chart's secrets
*/}}
{{- define "nv-ingest.ngcAPIKey" }}
{{- printf "%s" .Values.ngcSecret.password }}
{{- end }}


{{/*
Create a Triton StatefulSet (plus Service and storage) with the NGC model puller
*/}}
{{- define "nv-ingest.triton-statefulset" }}

{{ $tritonCommon := .Values.triton.common }}
{{ $root := .Values }}

{{- range $tName, $triton := .Values.triton.instances }}

{{- $pvcUsingTemplate := and $triton.persistence.enabled (not $triton.persistence.existingClaim) (ne $triton.persistence.accessMode "ReadWriteMany") | ternary true false }}
---
apiVersion: v1
kind: Service
metadata:
  name: "{{ $.Release.Name }}-{{ $triton.service.name }}"
  labels:
    {{- include "nv-ingest.labels" $ | nindent 4 }}
spec:
  type: ClusterIP
  ports:
    {{- if $tritonCommon.http_port }}
    - port: {{ $tritonCommon.http_port }}
      targetPort: http
      protocol: TCP
      name: http
    {{- end }}
    {{- if $tritonCommon.grpc_port }}
    - port: {{ $tritonCommon.grpc_port }}
      targetPort: grpc
      protocol: TCP
      name: grpc
    {{- end }}
    {{- if $tritonCommon.openai_port }}
    - port: {{ $tritonCommon.openai_port }}
      targetPort: http-openai
      name: http-openai
    {{- end }}
    {{- if $tritonCommon.nemo_port }}
    - port: {{ $tritonCommon.nemo_port }}
      targetPort: http-nemo
      name: http-nemo
    {{- end }}
    {{- if and $tritonCommon.metrics.enabled $tritonCommon.metrics.port }}
    - port: {{ $tritonCommon.metrics.port }}
      targetPort: metrics
      name: metrics
    {{- end }}
  selector:
    {{- include "nv-ingest.selectorLabels" $ | nindent 4 }}

---
{{- if and $triton.persistence.enabled (not $pvcUsingTemplate) (not $triton.persistence.existingClaim) }}
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: "{{ include "nv-ingest.fullname" $ }}-{{ $tName }}"
  labels:
    {{- include "nv-ingest.labels" $ | nindent 4 }}
  {{- with $triton.persistence.annotations }}
  annotations:
    {{ toYaml . | indent 4 }}
  {{- end }}
spec:
  accessModes:
    - {{ $triton.persistence.accessMode | quote }}
  resources:
    requests:
      storage: {{ $triton.persistence.size | quote }}
  {{- if $triton.persistence.storageClass }}
  storageClassName: "{{ $triton.persistence.storageClass }}"
  {{- end }}
{{ end }}

---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: "{{ $.Release.Name }}-{{ $tName }}"
  labels:
    {{- include "nv-ingest.labels" $ | nindent 4 }}
spec:
  podManagementPolicy: "Parallel"
  {{- if not $root.autoscaling.enabled }}
  replicas: {{ $triton.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "nv-ingest.selectorLabels" $ | nindent 6 }}
  serviceName: {{ include "nv-ingest.fullname" $ }}
  template:
    metadata:
      {{- with $root.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "nv-ingest.selectorLabels" $ | nindent 8 }}
    spec:
      {{- with $root.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "nv-ingest.serviceAccountName" $ }}
      securityContext:
        {{- toYaml $root.podSecurityContext | nindent 8 }}
      initContainers:
        {{- range $name, $model := $triton.models }}
        - name: ngc-model-puller-{{ $name }}
          image: "{{ $root.image.repository }}:{{ $root.image.tag }}"
          command:
            - "/bin/bash"
            - "-c"
          args:
            - apt-get update && apt-get install --yes wget && wget "https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.34.1/files/ngccli_linux.zip" -O "/workspace/ngccli_linux.zip" && unzip /workspace/ngccli_linux.zip && chmod u+x ngc-cli/ngc && export PATH="$PATH:/workspace/ngc-cli" && /scripts/ngc_pull.sh
          env:
            - name: NGC_CLI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: "ngc-api"
                  key: NGC_API_KEY
            - name: NGC_DECRYPT_KEY
              valueFrom:
                secretKeyRef:
                  name: "ngc-api"
                  key: NGC_DECRYPT_KEY
                  optional: true
            - name: STORE_MOUNT_PATH
              value: "/model-store"
            - name: NGC_CLI_ORG
              value: {{ $model.NGC_CLI_ORG | quote }}
            - name: NGC_CLI_TEAM
              value: {{ $model.NGC_CLI_TEAM | quote }}
            - name: NGC_CLI_VERSION
              value: {{ $model.NGC_CLI_VERSION | quote }}
            - name: NGC_MODEL_NAME
              value: {{ $model.NGC_MODEL_NAME | quote }}
            - name: NGC_MODEL_VERSION
              value: {{ $model.NGC_MODEL_VERSION | quote }}
            - name: MODEL_NAME
              value: {{ $model.MODEL_NAME | quote }}
            - name: TARFILE
              value: {{ $model.TARFILE | quote }}
            - name: NGC_EXE
              value: "ngc"
          volumeMounts:
            - mountPath: /model-store
              name: model-store-{{ $tName }}
              subPath: {{ $tName }}
            - name: scripts-volume
              mountPath: /scripts
        {{- end }}
        - name: clean-up
          image: ubuntu:latest
          command:
            - /bin/sh
            - -c
            - >
              rm -rf /model-store/nemo-retriever-* &&
              rm -rf /model-store/nemo-retrieval-*
          volumeMounts:
            - mountPath: /model-store
              subPath: {{ $tName }}
              name: model-store-{{ $tName }}

      containers:
        - name: {{ $tName }}
          securityContext:
            {{- toYaml $root.containerSecurityContext | nindent 12 }}
          image: "{{ $triton.image.repository }}:{{ $triton.image.tag }}"
          imagePullPolicy: {{ $root.image.pullPolicy }}
          command:
            - /bin/sh
            - -c
          args:
            - "tritonserver --model-repository=/model-store/ --log-verbose=1 || sleep 120"
          ports:
            - containerPort: 8000
              name: http
            - containerPort: {{ $tritonCommon.health_port }}
              name: health
            - containerPort: {{ $tritonCommon.grpc_port }}
              name: grpc
            {{- if $tritonCommon.metrics.enabled }}
            - containerPort: {{ $tritonCommon.metrics.port }}
              name: metrics
            {{- end }}
            - containerPort: {{ $tritonCommon.openai_port }}
              name: http-openai
            - containerPort: {{ $tritonCommon.nemo_port }}
              name: http-nemo
          {{- if $tritonCommon.livenessProbe.enabled }}
          {{- with $tritonCommon.livenessProbe }}
          livenessProbe:
            httpGet:
              path: {{ .path }}
              port: {{ .port }}
            initialDelaySeconds: {{ .initialDelaySeconds }}
            periodSeconds: {{ .periodSeconds }}
            timeoutSeconds: {{ .timeoutSeconds }}
            successThreshold: {{ .successThreshold }}
            failureThreshold: {{ .failureThreshold }}
          {{- end }}
          {{- end }}
          {{- if $tritonCommon.readinessProbe.enabled }}
          {{- with $tritonCommon.readinessProbe }}
          readinessProbe:
            httpGet:
              path: {{ .path }}
              port: {{ .port }}
            initialDelaySeconds: {{ .initialDelaySeconds }}
            periodSeconds: {{ .periodSeconds }}
            timeoutSeconds: {{ .timeoutSeconds }}
            successThreshold: {{ .successThreshold }}
            failureThreshold: {{ .failureThreshold }}
          {{- end }}
          {{- end }}
          {{- if $tritonCommon.startupProbe.enabled }}
          {{- with $tritonCommon.startupProbe }}
          startupProbe:
            httpGet:
              path: {{ .path }}
              port: {{ .port }}
            initialDelaySeconds: {{ .initialDelaySeconds }}
            periodSeconds: {{ .periodSeconds }}
            timeoutSeconds: {{ .timeoutSeconds }}
            successThreshold: {{ .successThreshold }}
            failureThreshold: {{ .failureThreshold }}
          {{- end }}
          {{- end }}
          resources:
            {{- toYaml $triton.resources | nindent 12 }}
          volumeMounts:
            - name: model-store-{{ $tName }}
              mountPath: /model-store
              subPath: {{ $tName }}
            - mountPath: /dev/shm
              name: dshm
            {{- if $root.extraVolumeMounts }}
            {{- range $k, $v := $root.extraVolumeMounts }}
            - name: {{ $k }}
              {{- toYaml $v | nindent 14 }}
            {{- end }}
            {{- end }}
      terminationGracePeriodSeconds: 60
      {{- with $root.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $root.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $root.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: scripts-volume
          configMap:
            name: {{ $.Release.Name }}-scripts-configmap
            defaultMode: 0555
        {{- if not $pvcUsingTemplate }}
        - name: model-store-{{ $tName }}
          {{- if $triton.persistence.enabled }}
          persistentVolumeClaim:
            {{- if $triton.persistence.existingClaim }}
            claimName: {{ $triton.persistence.existingClaim }}
            {{- else }}
            claimName: "{{ include "nv-ingest.fullname" $ }}-{{ $tName }}"
            {{- end }}
          {{- else if $triton.hostPath.enabled }}
          hostPath:
            path: {{ $triton.hostPath.path }}
            type: DirectoryOrCreate
          {{- else }}
          emptyDir: {}
          {{- end }}
        {{- end }}
        {{- if $root.extraVolumes }}
        {{- range $k, $v := $root.extraVolumes }}
        - name: {{ $k }}
          {{- toYaml $v | nindent 10 }}
        {{- end }}
        {{- end }}
  {{- if $pvcUsingTemplate }}
  {{- with $triton.stsPersistentVolumeClaimRetentionPolicy }}
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: {{ .whenDeleted }}
    whenScaled: {{ .whenScaled }}
  {{- end }}
  volumeClaimTemplates:
  - metadata:
      name: model-store-{{ $tName }}
      labels:
        {{- include "nv-ingest.labels" $ | nindent 8 }}
      {{- with $triton.persistence.annotations }}
      annotations:
        {{ toYaml . | indent 4 }}
      {{- end }}
    spec:
      accessModes:
        - {{ $triton.persistence.accessMode | quote }}
      resources:
        requests:
          storage: {{ $triton.persistence.size | quote }}
      {{- if $triton.persistence.storageClass }}
      storageClassName: "{{ $triton.persistence.storageClass }}"
      {{- end }}
  {{- end }}

{{- end }}
{{- end }}
11
helm/templates/config-map.yaml
Normal file
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Release.Name }}-scripts-configmap
data:
  ngc_pull.sh: |-
{{ .Files.Get "files/ngc_pull.sh" | indent 4 }}
8
helm/templates/configmap.yaml
Normal file
@@ -0,0 +1,8 @@
{{- if not .Values.extraEnvVarsCM }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "nv-ingest.fullname" . }}
data:
{{- end }}
177
helm/templates/deployment.yaml
Normal file
@@ -0,0 +1,177 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "nv-ingest.fullname" . }}
  labels:
    {{- include "nv-ingest.labels" . | nindent 4 }}
spec:
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "nv-ingest.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "nv-ingest.labels" . | nindent 8 }}
        {{- with .Values.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "nv-ingest.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}

      initContainers:
        - name: verify-tmpdir-permissions
          image: busybox
          command:
            - "sh"
            - "-c"
            - "mkdir -p /scratch/.cache /scratch/tmp && chown -R {{ .Values.nemo.userID }}:{{ .Values.nemo.groupID }} /scratch"
          volumeMounts:
            - name: ephemeral
              subPath: scratch
              mountPath: /scratch

      containers:
        - name: {{ .Chart.Name }}
          securityContext:
            {{- toYaml .Values.containerSecurityContext | nindent 12 }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          env:
            - name: HOST_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY

            - name: HF_HOME
              value: /scratch/.cache
            - name: TMPDIR
              value: /scratch/tmp

            {{- if .Values.envVars }}
            {{- range $k, $v := .Values.envVars }}
            - name: "{{ $k }}"
              value: "{{ $v }}"
            {{- end }}
            {{- end }}

            # OpenTelemetry
            {{- if .Values.otelEnabled }}
            {{- range $k, $v := .Values.otelEnvVars }}
            - name: "{{ $k }}"
              value: "{{ $v }}"
            {{- end }}
            {{- if and .Values.otelEnabled ( not ( index .Values.otelEnvVars "OTEL_EXPORTER_OTLP_ENDPOINT" ) ) }}
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: "http://{{ .Release.Name }}-opentelemetry-collector:4318"
            - name: OTEL_EXPORTER_OTLP_PROTOCOL
              value: http/protobuf
            {{- end }}
            - name: CONSOLE_LOG_LEVEL
              value: "{{ .Values.logLevel }}"
            - name: OTEL_LOG_LEVEL
              value: "{{ .Values.logLevel }}"
            - name: LOG_LEVEL
              value: "{{ .Values.logLevel }}"
            {{- else }}
            - name: OTEL_TRACES_EXPORTER
              value: none
            - name: OTEL_METRICS_EXPORTER
              value: none
            - name: OTEL_LOGS_EXPORTER
              value: none
            - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
              value: "false"
            - name: LOG_HANDLERS
              value: console
            - name: CONSOLE_LOG_LEVEL
              value: "{{ .Values.logLevel }}"
            - name: LOG_LEVEL
              value: "{{ .Values.logLevel }}"
            {{- end }}
          envFrom:
            {{- if .Values.extraEnvVarsCM }}
            - configMapRef:
                name: {{ include "common.tplvalues.render" (dict "value" .Values.extraEnvVarsCM "context" $) }}
            {{- else }}
            - configMapRef:
                name: {{ include "nv-ingest.fullname" . }}
            {{- end }}
            {{- if .Values.extraEnvVarsSecret }}
            - secretRef:
                name: {{ include "common.tplvalues.render" (dict "value" .Values.extraEnvVarsSecret "context" $) }}
            {{- end }}

          ports:
            - name: http
              containerPort: {{ .Values.service.port }}
              protocol: TCP
          {{- if .Values.livenessProbe.enabled }}
          livenessProbe:
            {{- toYaml (omit .Values.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
          {{- if .Values.readinessProbe.enabled }}
          readinessProbe:
            {{- toYaml (omit .Values.readinessProbe "enabled") | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - name: ephemeral
            subPath: scratch
            mountPath: /scratch
          {{- if .Values.extraVolumeMounts }}
          {{- range $k, $v := .Values.extraVolumeMounts }}
          - name: {{ $k }}
            {{- toYaml $v | nindent 12 }}
          {{- end }}
          {{- end }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: ephemeral
          emptyDir:
            sizeLimit: "{{ .Values.tmpDirSize }}"
        {{- if .Values.extraVolumes }}
        {{- range $k, $v := .Values.extraVolumes }}
        - name: {{ $k }}
          {{- toYaml $v | nindent 10 }}
        {{- end }}
        {{- end }}
20
helm/templates/hpa.yaml
Normal file
@@ -0,0 +1,20 @@
{{- if .Values.autoscaling.enabled }}
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "nv-ingest.fullname" . }}
  labels:
    {{- include "nv-ingest.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "nv-ingest.fullname" . }}
  minReplicas: {{ .Values.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
  metrics:
    {{- range .Values.autoscaling.metrics }}
    - {{- . | toYaml | nindent 10 }}
    {{- end }}
{{- end }}
26
helm/templates/image-pull-secret.yaml
Normal file
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
{{ if .Values.ngcSecret.create -}}
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-api
type: Opaque
stringData:
  NGC_CLI_API_KEY: {{ template "nv-ingest.ngcAPIKey" . }}
  NGC_API_KEY: {{ template "nv-ingest.ngcAPIKey" . }}
{{- end }}


{{ if and .Values.imagePullSecret.name .Values.imagePullSecret.create -}}
---
apiVersion: v1
kind: Secret
metadata:
  name: {{ .Values.imagePullSecret.name }}
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: {{ template "nv-ingest.imagePullSecret" . }}
{{- end }}
62
helm/templates/ingress.yaml
Normal file
@@ -0,0 +1,62 @@
---
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "nv-ingest.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className }}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
  name: {{ $fullName }}
  labels:
    {{- include "nv-ingest.labels" . | nindent 4 }}
  {{- with .Values.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
  ingressClassName: {{ .Values.ingress.className }}
  {{- end }}
  {{- if .Values.ingress.tls }}
  tls:
    {{- range .Values.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    {{- range .Values.ingress.hosts }}
    - host: {{ .host | quote }}
      http:
        paths:
          {{- range .paths }}
          - path: {{ .path }}
            {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
            pathType: {{ .pathType }}
            {{- end }}
            backend:
              {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
              service:
                name: {{ $fullName }}
                port:
                  number: {{ $svcPort }}
              {{- else }}
              serviceName: {{ $fullName }}
              servicePort: {{ $svcPort }}
              {{- end }}
          {{- end }}
    {{- end }}
{{- end }}
30
helm/templates/service.yaml
Normal file
@@ -0,0 +1,30 @@
---
apiVersion: v1
kind: Service
metadata:
  name: {{ .Values.service.name | default (include "nv-ingest.fullname" .) }}
  labels:
    {{- include "nv-ingest.labels" . | nindent 4 }}
    {{- if .Values.service.labels }}
    {{- toYaml .Values.service.labels | nindent 4 }}
    {{- end }}
  annotations:
    {{- if .Values.service.annotations }}
    {{- toYaml .Values.service.annotations | nindent 4 }}
    {{- end }}
spec:
  type: {{ .Values.service.type }}
  ports:
    {{- if .Values.service.port }}
    - port: {{ .Values.service.port }}
      targetPort: http
      protocol: TCP
      name: http
    {{- end }}
    {{- if .Values.service.nodePort }}
    {{- with .Values.service.nodePort }}
      nodePort: {{ . }}
    {{- end }}
    {{- end }}
  selector:
    {{- include "nv-ingest.selectorLabels" . | nindent 4 }}
14
helm/templates/serviceaccount.yaml
Normal file
@@ -0,0 +1,14 @@
{{- if .Values.serviceAccount.create -}}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "nv-ingest.serviceAccountName" . }}
  labels:
    {{- include "nv-ingest.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}
3
helm/templates/triton.yaml
Normal file
@@ -0,0 +1,3 @@
{{- if .Values.triton.enabled }}
{{ include "nv-ingest.triton-statefulset" . }}
{{- end }}