Skip to content

llama.cpp in a node

Register Model

Instructions for registering a GGUF-format Llama 2 model as an MLflow model are provided here.

Register the model under the name Llama-2-7b-chat-hf-gguf. The Dockerfile loads version 1 of it through the URI models:/Llama-2-7b-chat-hf-gguf/1.

DAG Node

Dockerfile:

FROM python:3.9
# FROM nvcr.io/nvidia/pytorch:23.07-py3

# Python dependencies for the node, installed one requirement per pip
# invocation in a single layer.
#
# The mlflow requirement MUST stay quoted: unquoted, the shell parses
# `>=2.6.0` as an output redirection, so pip installs an unconstrained mlflow
# and its output is written to a file named "=2.6.0".
#
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python -m pip install --no-cache-dir 'transformers[torch]' \
    && python -m pip install --no-cache-dir 'mlflow>=2.6.0' \
    && python -m pip install --no-cache-dir azure-storage-blob \
    && python -m pip install --no-cache-dir numpy \
    && python -m pip install --no-cache-dir scipy \
    && python -m pip install --no-cache-dir pandas \
    && python -m pip install --no-cache-dir scikit-learn \
    && python -m pip install --no-cache-dir cloudpickle \
    && python -m pip install --no-cache-dir tzlocal \
    && python -m pip install --no-cache-dir infinstor_mlflow_plugin \
    && python -m pip install --no-cache-dir infinstor \
    && python -m pip install --no-cache-dir pynvml \
    && python -m pip install --no-cache-dir arnparse \
    && python -m pip install --no-cache-dir cwsearch_utils

# Build llama-cpp-python from source with the selected backend.
#
# CACHEBUST is not referenced by any instruction; presumably its value is
# bumped (or overridden with --build-arg) to force the following layers to be
# rebuilt — confirm with the build tooling in use.
ARG CACHEBUST=37
# Backend selection; override with --build-arg CMAKE_ARGS=... .
ARG CMAKE_ARGS=-DLLAMA_CLBLAST=ON
#ARG CMAKE_ARGS=-DLLAMA_CUBLAS=ON
# key=value form replaces the deprecated space-separated `ENV key value`.
# These variables are presumably consumed by the llama-cpp-python build
# (CMAKE_ARGS / FORCE_CMAKE) and the CUDA compiler (CUDAFLAGS) — verify
# against the llama-cpp-python install documentation.
ENV CMAKE_ARGS="${CMAKE_ARGS}" \
    FORCE_CMAKE=1 \
    CUDAFLAGS="-arch=all -lcublas"
RUN python -m pip install --no-cache-dir llama-cpp-python

# Credentials and model download.
#
# NOTE(review, security): INFINSTOR_TOKEN arrives as a build arg, is persisted
# as an ENV variable, and is written to /root/.infinstor/token inside an image
# layer. Anyone who can pull the image (or read `docker history`) can recover
# it. If the token is only needed to download the model at build time,
# consider a BuildKit secret mount (`RUN --mount=type=secret,...`) instead —
# confirm first whether the running node needs the token or the token file.
ARG INFINSTOR_TOKEN
ENV INFINSTOR_TOKEN="${INFINSTOR_TOKEN}"
# Write the token file in the same layer that creates its directory.
RUN mkdir -p /root/.infinstor \
    && echo "Token=Custom $INFINSTOR_TOKEN" > /root/.infinstor/token

ARG MLFLOW_TRACKING_URI
ENV MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI}"

# Download version 1 of the registered model into /root/model at build time,
# so the node code can load it from the local path.
RUN mkdir -p /root/model \
    && python -c "import mlflow; pfmodel = mlflow.pyfunc.load_model('models:/Llama-2-7b-chat-hf-gguf/1', suppress_warnings=False, dst_path='/root/model');"

Node code

# Load the pyfunc model from the local directory /root/model.
pfmodel = mlflow.pyfunc.load_model('/root/model', suppress_warnings=False)
# Get the underlying Python model object so its predict() can be called
# directly with an extra parameters dict.
sentence_model = pfmodel.unwrap_python_model()

# Run prediction on df and collect the 'embedding' entry of every result row.
pred = sentence_model.predict(df, {'n_gpu_layers': '32'})
embeddings.extend(row['embedding'] for row in pred)