sage/embeddings/embeddings_from_text.py

# code from https://platform.openai.com/docs/tutorials/web-qa-embeddings
import os

import pandas as pd
import tiktoken

max_tokens = 500

# Load the cl100k_base tokenizer, which is designed to work with the ada-002
# model. It is defined at module scope because both split_into_many and
# EmbeddingsBuilder.txt2csv rely on it.
tokenizer = tiktoken.get_encoding("cl100k_base")
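
# Illustrative check (editorial addition, commented out so importing this
# module stays side-effect free): token counts are not word counts.
#
#   sample = "Tokenization splits text into subword units."
#   print(len(sample.split()), len(tokenizer.encode(sample)))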


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens=max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and token counts joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the
        # current sentence is greater than the max number of tokens, then add
        # the chunk to the list of chunks and reset the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence alone is greater
        # than the max number of tokens, skip the sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add its token count
        # (plus one for the ". " separator) to the running total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Flush the final chunk, which the loop above never appends
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
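
# A minimal usage sketch for split_into_many (editorial addition, commented
# out; the sample text is a hypothetical stand-in for a real document):
#
#   long_text = ". ".join(f"Sentence number {i} of a long document" for i in range(500))
#   chunks = split_into_many(long_text, max_tokens=100)
#   print(len(chunks), max(len(tokenizer.encode(c)) for c in chunks))
#
# Each chunk stays at roughly 100 tokens or fewer, because the loop flushes a
# chunk before adding a sentence that would push it past the limit.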


# Collapse newlines (and stray escaped "\n" sequences) into spaces, then
# squeeze runs of double spaces down to single spaces
def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie
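
# Usage sketch for remove_newlines (editorial addition, commented out):
#
#   s = pd.Series(["first line\nsecond  line  here"])
#   print(remove_newlines(s).tolist())
#   # ['first line second line here']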


class EmbeddingsBuilder:
    def __init__(self):
        pass

    def txt2csv(self, sourcePath, csvfile):
        # Create a list to store the text files
        texts = []

        # Get all the text files in the source directory
        for file in os.listdir(sourcePath):
            # Open the file and read the text
            with open(os.path.join(sourcePath, file), "r", encoding="UTF-8") as f:
                text = f.read()
                # Store the filename together with the raw text
                texts.append((file, text))

        # Create a dataframe from the list of texts
        df = pd.DataFrame(texts, columns=['title', 'text'])

        # Set the text column to the title followed by the raw text with the
        # newlines removed
        df['text'] = df.title + ". " + remove_newlines(df.text)

        # Tokenize the text and save the number of tokens to a new column
        df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

        shortened = []

        # Loop through the dataframe
        for _, row in df.iterrows():
            # If the text is None, go to the next row
            if row['text'] is None:
                continue

            # If the number of tokens is greater than the max number of
            # tokens, split the text into chunks
            if row['n_tokens'] > max_tokens:
                shortened += split_into_many(row['text'])
            # Otherwise, add the text to the list of shortened texts
            else:
                shortened.append(row['text'])

        # Rebuild the dataframe from the shortened texts, recount tokens, and
        # write the result to the target CSV file
        df = pd.DataFrame(shortened, columns=['text'])
        df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
        df.to_csv(csvfile)
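

# A minimal end-to-end sketch (editorial addition; "texts/" and
# "embeddings_input.csv" are hypothetical paths, not part of the original
# pipeline). Guarded so it only runs when the module is executed directly.
if __name__ == "__main__":
    builder = EmbeddingsBuilder()
    # Read every text file under texts/ and write the chunked rows, with
    # their token counts, to embeddings_input.csv
    builder.txt2csv("texts/", "embeddings_input.csv")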