Scikit-LLM & OpenAI API

CSD3 Module

They are available with

module load ceuadmin/Anaconda3/2023.09-0

which takes advantage of the Anaconda module; however, as with other software in this repository, more details are given below.

Installation

module load ceuadmin/Anaconda3/2023.09-0
pip install scikit-llm watermark
pip install openai langchain pypdf unstructured "unstructured[pdf]"

Configuration

The API keys and the organization ID are available from https://platform.openai.com/account/api-keys and https://platform.openai.com/account/org-settings, respectively.

# Import SKLLMConfig to configure the OpenAI API (key and organization)
from skllm.config import SKLLMConfig

# Placeholder values: replace them with your own API key and organization ID
OPENAI_API_KEY = "sk-*"
OPENAI_ORG_ID = "org-*"
# Set your OpenAI API key
SKLLMConfig.set_openai_key(OPENAI_API_KEY)
# Set your OpenAI organization
SKLLMConfig.set_openai_org(OPENAI_ORG_ID)

Implementation

(To be refined)

SciKit-LLM

# Zero-Shot GPTClassifier

# Import the zero-shot GPT classifier and the classification dataset loader
from skllm import ZeroShotGPTClassifier
from skllm.datasets import get_classification_dataset
# Load the sentiment analysis dataset
# labels: positive, negative, neutral
X, y = get_classification_dataset()
# Notebook-style inspection: the bare expressions below only display a value
# in an interactive session and have no effect when run as a script
len(X)
X
y
# to notice: indexing starts at 0
def training_data(data):
    """Return the training portion of ``data``.

    The input is treated as three consecutive groups of ten items
    (zero-based positions 0-9, 10-19 and 20-29). The first eight items of
    each group are kept and the three pieces are joined with ``+``.
    """
    head_1, head_2, head_3 = (data[start:start + 8] for start in (0, 10, 20))
    return head_1 + head_2 + head_3
# to notice: indexing starts at 0
def testing_data(data):
    subset_1 = data[8:10]  # Last 2 elements from 1-10
    subset_2 = data[18:20]  # Last 2 elements from 11-20
    subset_3 = data[28:30]  # Last 2 elements from rest of the data
    combined_data = subset_1 + subset_2 + subset_3
    return combined_data
# Build the training split (first eight items of each group of ten)
X_train = training_data(X)
print(len(X_train))
X_train
y_train = training_data(y)
print(len(y_train))
y_train
# Build the test split (last two items of each group of ten); X_test and
# y_test are required by the prediction and accuracy steps below
X_test = testing_data(X)
y_test = testing_data(y)
# defining the openai model to use
clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
# fitting the data
clf.fit(X_train, y_train)
# predicting the data
predicted_labels = clf.predict(X_test)
for review, sentiment in zip(X_test, predicted_labels):
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n\n")
# Compare the predictions with the true test labels
from sklearn.metrics import accuracy_score
print(f"Accuracy: {accuracy_score(y_test, predicted_labels):.2f}")

# No Labeled Data

# defining the model
clf_no_label = ZeroShotGPTClassifier()
# No training data: only the candidate labels are passed to fit
clf_no_label.fit(None, ['positive', 'negative', 'neutral'])
# predicting the labels
predicted_labels_without_training_data = clf_no_label.predict(X_test)
predicted_labels_without_training_data
# Show each review next to its predicted sentiment
for review, sentiment in zip(X_test, predicted_labels_without_training_data):
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n\n")
# Compare the predictions with the true test labels
print(f"Accuracy: {accuracy_score(y_test, predicted_labels_without_training_data):.2f}")

# Multilabel Zero-Shot Text Classification

# importing the multi-label zero-shot classifier and its dataset loader
from skllm import MultiLabelZeroShotGPTClassifier
from skllm.datasets import get_multilabel_classification_dataset
# get the multi-label classification dataset from skllm
X, y = get_multilabel_classification_dataset()
# defining the model
clf = MultiLabelZeroShotGPTClassifier(max_labels=3)
# fitting the model
clf.fit(X, y)
# making predictions
labels = clf.predict(X)

# No Labeled Data

# Only the texts are needed for prediction; the dataset's labels are discarded
from skllm import MultiLabelZeroShotGPTClassifier
from skllm.datasets import get_multilabel_classification_dataset

X, _ = get_multilabel_classification_dataset()
# All the labels that may be predicted
candidate_labels = ["Quality", "Price", "Delivery", "Service", "Product Variety"]
# Build the classifier
clf = MultiLabelZeroShotGPTClassifier(max_labels=3)
# Fit on the candidate labels alone (no training texts)
clf.fit(None, [candidate_labels])
# Predict labels for the texts
labels = clf.predict(X)

# Text Vectorization

# Importing the GPTVectorizer class from the skllm.preprocessing module
from skllm.preprocessing import GPTVectorizer
# Creating an instance of the GPTVectorizer class and assigning it to the
# variable 'model'
model = GPTVectorizer()
# Transforming the texts in 'X' into vectors
vectors = model.fit_transform(X)
# Importing the necessary modules and classes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
# Creating an instance of LabelEncoder class
le = LabelEncoder()
# Encoding the training labels 'y_train' using LabelEncoder
y_train_encoded = le.fit_transform(y_train)
# Encoding the test labels 'y_test' using LabelEncoder
y_test_encoded = le.transform(y_test)
# Defining the steps of the pipeline as a list of tuples
steps = [('GPT', GPTVectorizer()), ('Clf', XGBClassifier())]
# Creating a pipeline with the defined steps
clf = Pipeline(steps)
# Fitting the pipeline on the training data 'X_train' and the encoded
# training labels 'y_train_encoded'
clf.fit(X_train, y_train_encoded)
# Predicting the labels for the test data 'X_test' using the trained
# pipeline
yh = clf.predict(X_test)

# Text Summarization

# Importing the GPTSummarizer class from the skllm.preprocessing module
from skllm.preprocessing import GPTSummarizer
# Importing the get_summarization_dataset function
from skllm.datasets import get_summarization_dataset
# Calling the get_summarization_dataset function
X = get_summarization_dataset()
# Creating an instance of the GPTSummarizer
s = GPTSummarizer(openai_model='gpt-3.5-turbo', max_words=15)
# Applying the fit_transform method of the GPTSummarizer instance to the
# input data 'X'.
# It fits the model to the data and generates the summaries, which are
# assigned to the variable 'summaries'
summaries = s.fit_transform(X)

OpenAI API

Web: Alice in Wonderland, https://www.gutenberg.org/ebooks/11

We first have our OpenAI key,

export OPENAI_API_KEY=$(grep sk ~/doc/OpenAI)

and our scripts are as follows,

# Initialisation

import openai
import os
import langchain
from langchain import OpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# Read the API key from the OPENAI_API_KEY environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")

# Create a separate instance for the language model
openai_lm = OpenAI()

# Test the environment

# Request a short completion for a fixed prompt and print the returned text.
# NOTE(review): openai.Completion looks like the pre-1.0 openai interface —
# confirm it matches the installed version of the openai package
response = openai.Completion.create(engine="davinci", prompt="Once upon a time in a", max_tokens=50)
print(response.choices[0].text.strip())

# Data preparation

def load_pdf(pdf_path):
    """Load the file at ``pdf_path`` and return what the loader produces.

    The path is handed to ``UnstructuredFileLoader`` and the result of its
    ``load()`` call is returned unchanged.
    """
    return UnstructuredFileLoader(pdf_path).load()

# Load the book PDF and split the loaded pages into chunks
# NOTE(review): the path looks like a Google Colab drive mount — adjust it to
# where the PDF lives on the system in use
pages = load_pdf('/content/drive/MyDrive/Colab Notebooks/Alices_Advantures_in_Wonderland_by_Lewis_Carroroll.pdf')
text_to_chunks = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)  # chunk size 500, no overlap between chunks
chunks_of_text = text_to_chunks.split_documents(pages)

# Embeddings and VectorDB Using LangChain and Chroma

# Embed the text chunks with OpenAI embeddings and index them in a Chroma
# vector store (requires openai.api_key and chunks_of_text to be set up)
embeddings_function = OpenAIEmbeddings(openai_api_key=openai.api_key)
docsearch = Chroma.from_documents(chunks_of_text, embeddings_function)
# Build the question-answering chain. It is bound to `chain`, the name the
# query code calls, and uses the 'stuff' chain type because 'retrieval' is
# not a chain type accepted by RetrievalQA.from_chain_type.
chain = RetrievalQA.from_chain_type(llm=openai_lm, chain_type='stuff', retriever=docsearch.as_retriever())

# Utilising OpenAI API

# Ask five questions in turn; each one is read at runtime and answered by the
# QA chain
for _ in range(5):
    # Input the query at runtime
    user_query = input("Enter your query: ")
    # Run the QA using the provided query
    qa_result = chain.run(user_query)
    print("OpenAI Response:", qa_result)

Reference

Kulkarni A, Shivananda A, Kulkarni A, Gudivada D (2023). Applied Generative AI for Beginners. Apress, Berkeley, CA. https://link.springer.com/book/10.1007/978-1-4842-9994-4 (GitHub)