Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
ManishMadan2882 committed Oct 26, 2024
2 parents 0aa9da3 + 1c791f2 commit 1627d42
Show file tree
Hide file tree
Showing 30 changed files with 554 additions and 86 deletions.
4 changes: 4 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ updates:
directory: "/frontend" # Location of package manifests
schedule:
interval: "weekly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/cife.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/docker-develop-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/docker-develop-fe-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Lint with Ruff
uses: chartboost/ruff-action@v1
6 changes: 3 additions & 3 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand All @@ -24,7 +24,7 @@ jobs:
python -m pytest --cov=application --cov-report=xml
- name: Upload coverage reports to Codecov
if: github.event_name == 'pull_request' && matrix.python-version == '3.11'
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

2 changes: 1 addition & 1 deletion .github/workflows/sync_fork.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
steps:
# Step 1: run a standard checkout action
- name: Checkout target repo
uses: actions/checkout@v3
uses: actions/checkout@v4

# Step 2: run the sync action
- name: Sync upstream changes
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,4 +203,9 @@ We as members, contributors, and leaders, pledge to make participation in our co

The source code license is [MIT](https://opensource.org/license/mit/), as described in the [LICENSE](LICENSE) file.

Built with [:bird: :link: LangChain](https://github.com/hwchase17/langchain)
<p>This project is supported by:</p>
<p>
<a href="https://www.digitalocean.com/?utm_medium=opensource&utm_source=DocsGPT">
<img src="https://opensource.nyc3.cdn.digitaloceanspaces.com/attribution/assets/SVG/DO_Logo_horizontal_blue.svg" width="201px">
</a>
</p>
2 changes: 2 additions & 0 deletions application/api/user/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ def post(self):
".epub",
".html",
".mdx",
".json",
".xlsx",
],
job_name,
final_filename,
Expand Down
2 changes: 2 additions & 0 deletions application/parser/file/bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.schema.base import Document

DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
Expand All @@ -23,6 +24,7 @@
".rst": RstParser(),
".html": HTMLParser(),
".mdx": MarkdownParser(),
".json":JSONParser(),
}


Expand Down
57 changes: 57 additions & 0 deletions application/parser/file/json_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import json
from typing import Any, Dict, List, Union
from pathlib import Path

from application.parser.file.base_parser import BaseParser

class JSONParser(BaseParser):
r"""JSON (.json) parser.
Parses JSON files into a list of strings or a concatenated document.
It handles both JSON objects (dictionaries) and arrays (lists).
Args:
concat_rows (bool): Whether to concatenate all rows into one document.
If set to False, a Document will be created for each item in the JSON.
True by default.
row_joiner (str): Separator to use for joining each row.
Only used when `concat_rows=True`.
Set to "\n" by default.
json_config (dict): Options for parsing JSON. Can be used to specify options like
custom decoding or formatting. Set to empty dict by default.
"""

def __init__(
self,
*args: Any,
concat_rows: bool = True,
row_joiner: str = "\n",
json_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
self._row_joiner = row_joiner
self._json_config = json_config

def _init_parser(self) -> Dict:
"""Init parser."""
return {}

def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse JSON file."""

with open(file, 'r', encoding='utf-8') as f:
data = json.load(f, **self._json_config)

if isinstance(data, dict):
data = [data]

if self._concat_rows:
return self._row_joiner.join([str(item) for item in data])
else:
return data
17 changes: 11 additions & 6 deletions application/parser/remote/github_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List
from application.parser.remote.base import BaseRemote
from langchain_core.documents import Document
import mimetypes

class GitHubLoader(BaseRemote):
def __init__(self):
Expand All @@ -18,13 +19,17 @@ def fetch_file_content(self, repo_url: str, file_path: str) -> str:

if response.status_code == 200:
content = response.json()
mime_type, _ = mimetypes.guess_type(file_path) # Guess the MIME type based on the file extension

if content.get("encoding") == "base64":
try:
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
return f"Filename: {file_path}\n\n{decoded_content}"
except Exception as e:
print(f"Error decoding content for {file_path}: {e}")
raise
if mime_type and mime_type.startswith("text"): # Handle only text files
try:
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
return f"Filename: {file_path}\n\n{decoded_content}"
except Exception as e:
raise e
else:
return f"Filename: {file_path} is a binary file and was skipped."
else:
return f"Filename: {file_path}\n\n{content['content']}"
else:
Expand Down
Loading

0 comments on commit 1627d42

Please sign in to comment.