Source code for nb4llm.converter

# converter.py

import re
from pathlib import Path

import nbformat


# How many back ticks do I need?!

[docs]
def get_fence(cell_source: str, min_length: int = 3) -> str:
    """
    Build a backtick fence that can safely wrap ``cell_source``.

    The fence is one backtick longer than the longest run of consecutive
    backticks found in the text, and never shorter than ``min_length``.

    Parameters
    ----------
    cell_source : str
        The source code or markdown of a cell.
    min_length : int, optional
        Lower bound on the fence length (default: 3).

    Returns
    -------
    str
        A string made only of backticks (e.g., '```' or '````').

    Examples
    --------
    >>> get_fence("no backticks here")
    '```'
    >>> get_fence("text with a ``` run inside")
    '````'
    """
    # Track the longest uninterrupted run of backticks in the text.
    longest_run = 0
    for run in re.finditer(r"`+", cell_source):
        longest_run = max(longest_run, len(run.group()))

    # The fence must out-length every run so the run cannot close it early.
    required = longest_run + 1
    if required < min_length:
        required = min_length
    return "`" * required



# convert .ipynb to .txt

[docs]
def convert_ipynb_to_txt(ipynb_path: str, txt_path: str) -> None:
    """
    Convert a Jupyter notebook (.ipynb) to a readable text file (.txt).

    The output starts with a ``# <notebook file name>`` header line. Each
    markdown cell is then written as a fenced block tagged ``markdown`` and
    each code cell as a fenced block tagged with the notebook's kernel
    language (``python`` when the notebook metadata does not declare one).
    Cells of any other type (e.g. raw cells) are not written.

    The text file is always written as UTF-8 so that non-ASCII notebook
    content survives regardless of the platform's default locale encoding.

    Parameters
    ----------
    ipynb_path : str
        Path to the Jupyter notebook file.
    txt_path : str
        Path to the output text file.

    Returns
    -------
    None

    Examples
    --------
    >>> convert_ipynb_to_txt('notebook.ipynb', 'notebook.txt')
    # notebook.txt will contain:
    #   # notebook.ipynb
    #   ```markdown
    #   Some markdown
    #   ```
    #   ```python
    #   print('hello')
    #   ```

    CLI Example
    -----------
    $ nb4llm notebook.ipynb
    # Output: notebook.txt
    """
    nb = nbformat.read(ipynb_path, as_version=4)
    out_lines = [f"# {Path(ipynb_path).name}\n"]

    # Extract kernel language from notebook metadata
    kernel_language = "python"  # default fallback
    if hasattr(nb, "metadata") and nb.metadata:
        kernelspec = nb.metadata.get("kernelspec", {})
        if kernelspec and "language" in kernelspec:
            kernel_language = kernelspec["language"]

    for cell in nb.cells:
        if cell.cell_type == "markdown":
            tag = "markdown"
        elif cell.cell_type == "code":
            tag = kernel_language
        else:
            # Only markdown and code cells are part of the text format.
            continue

        # The fence is sized per cell so backticks inside the cell cannot
        # terminate the block early.
        fence = get_fence(cell.source)
        out_lines.append(f"{fence}{tag}")
        out_lines.append(cell.source)
        out_lines.append(f"{fence}\n")

    # Explicit UTF-8: the locale default encoding may be unable to represent
    # characters that appear in the notebook.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))



# Usage
# ipynb_path = "note_book_name.ipynb"
# txt_path = "note_book_name.txt"
# convert_ipynb_to_txt(ipynb_path, txt_path)


# convert .txt to .ipynb

[docs]
def convert_txt_to_ipynb(txt_path: str, ipynb_path: str) -> None:
    """
    Convert a text file (.txt) in nb4llm format back to a Jupyter notebook (.ipynb).

    The text is scanned line by line:

    * A leading ``# ...`` line is treated as the notebook-name header and
      skipped.
    * A line consisting of backticks followed by an optional language tag
      opens a fenced block; the block runs until a line equal to the same
      run of backticks (or the end of the file). A ``markdown`` tag yields a
      markdown cell; any other tag (or none) yields a code cell.
    * Consecutive non-empty lines outside any fence become a markdown cell.

    The notebook metadata defaults to a Python 3 kernel. When a code block
    is tagged with a language other than ``python``, both ``kernelspec`` and
    ``language_info`` are switched to that language so the two metadata
    entries do not contradict each other. If several non-Python languages
    appear, the last one encountered wins.

    The input file is read as UTF-8 (a leading byte-order mark is tolerated).

    Parameters
    ----------
    txt_path : str
        Path to the input text file.
    ipynb_path : str
        Path to the output Jupyter notebook file.

    Returns
    -------
    None

    Examples
    --------
    >>> convert_txt_to_ipynb('notebook.txt', 'notebook.ipynb')
    # notebook.ipynb will be created from the text blocks in notebook.txt

    CLI Example
    -----------
    $ nb4llm --reverse notebook.txt
    # Output: notebook.ipynb
    """
    # "utf-8-sig" decodes plain UTF-8 and also strips a BOM if an editor
    # added one, which would otherwise hide the "# " header on the first line.
    with open(txt_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    # Split content into lines
    lines = content.split("\n")

    # Skip notebook name header if present
    if lines and lines[0].startswith("# "):
        lines = lines[1:]

    # Create notebook structure with Python defaults; the metadata is
    # replaced below if a different code language is detected.
    nb = nbformat.v4.new_notebook()
    nb.metadata = {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {
            "codemirror_mode": {"name": "ipython", "version": 3},
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.8.0",
        },
    }

    # language -> (kernel name, display name) for languages with a
    # conventional kernel; anything else falls back to the language name.
    known_kernels = {
        "r": ("ir", "R"),
        "julia": ("julia-1.8", "Julia 1.8.5"),
        "javascript": ("nodejs", "Node.js"),
    }

    # Opening fence: a run of backticks plus an optional language tag. The
    # tag may contain any non-space, non-backtick characters so that tags
    # such as "c++" are recognised, not only word characters.
    fence_re = re.compile(r"^(`+)([^\s`]*)$")

    # Parse blocks
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Skip empty lines
        if not line:
            i += 1
            continue

        fence_match = fence_re.match(line)
        if fence_match:
            fence, cell_type = fence_match.groups()

            # Collect everything up to the matching closing fence (or EOF).
            block_content = []
            i += 1  # Move past the opening fence
            while i < len(lines) and lines[i].strip() != fence:
                block_content.append(lines[i])
                i += 1
            source = "\n".join(block_content)

            if cell_type == "markdown":
                cell = nbformat.v4.new_markdown_cell(source)
            else:
                cell = nbformat.v4.new_code_cell(source)

                # Update kernel metadata if we detect a different language
                if cell_type and cell_type != "python":
                    kernel_name, display_name = known_kernels.get(
                        cell_type, (cell_type, cell_type.capitalize())
                    )
                    nb.metadata["kernelspec"] = {
                        "display_name": display_name,
                        "language": cell_type,
                        "name": kernel_name,
                    }
                    # Drop the Python-specific language_info so it does not
                    # contradict the kernelspec; "name" is the only field
                    # that can be stated from the language tag alone.
                    nb.metadata["language_info"] = {"name": cell_type}

            nb.cells.append(cell)
            i += 1  # Move past the closing fence
        else:
            # Content outside a fence is treated as markdown. The current
            # line is known to be non-empty and not a fence, so it always
            # starts the block; continue until a blank line or a fence.
            block_content = [lines[i]]
            i += 1
            while i < len(lines):
                stripped = lines[i].strip()
                if not stripped or fence_re.match(stripped):
                    break
                block_content.append(lines[i])
                i += 1

            nb.cells.append(nbformat.v4.new_markdown_cell("\n".join(block_content)))

    # Write the notebook
    nbformat.write(nb, ipynb_path)
Source code for nb4llm.converter

nb4llm

Navigation

Related Topics