From 647cce5f19b6f9c71a7a72fc12538b9ac7b234c1 Mon Sep 17 00:00:00 2001
From: serakiepiphany <142875362+serakiepiphany@users.noreply.github.com>
Date: Thu, 2 Jul 2026 13:42:24 +0800
Subject: [PATCH] Add the dataset for imdb data

---
 .../singa_peft/examples/data/imdb_data.py     | 283 ++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 examples/singa_peft/examples/data/imdb_data.py

diff --git a/examples/singa_peft/examples/data/imdb_data.py b/examples/singa_peft/examples/data/imdb_data.py
new file mode 100644
index 0000000000..973f9e5bfb
--- /dev/null
+++ b/examples/singa_peft/examples/data/imdb_data.py
@@ -0,0 +1,283 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import re
+import os
+import pickle
+import urllib
+import tarfile
+import numpy as np
+import pandas as pd
+import nltk
+from nltk.stem import PorterStemmer
+from nltk.tokenize.toktok import ToktokTokenizer
+from gensim.models.keyedvectors import KeyedVectors
+from sklearn.model_selection import train_test_split
+from bs4 import BeautifulSoup
+'''
+    data collection preprocessing constants
+'''
+download_dir = '/tmp/'  # downloads and extracted data are stored here
+preprocessed_imdb_data_fp = download_dir + 'imdb_processed.pickle'  # output pickle
+imdb_dataset_link = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+google_news_pretrain_embeddings_link = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
+
+
+def pad_batch(b, seq_limit):
+    ''' cut or pad every encoded sequence of a batch to seq_limit entries
+
+        args:
+            b: iterable of records, r[0] encoded sequence, r[1] label 1 or 0
+            seq_limit: length of every returned sequence
+        returns:
+            (int32 sequences, float32 one-hot labels, float32 labels)
+    '''
+    seqs, labels = [], []
+    for record in b:
+        tokens = record[0]
+        missing = seq_limit - len(tokens)
+        if missing > 0:
+            # short sequence: fill the tail with index 0
+            seqs.append(tokens + [0] * missing)
+        else:
+            seqs.append(tokens[:seq_limit])
+        labels.append(record[1])
+    # label 1 -> [0, 1], any other label -> [1, 0]
+    onehot = [[0, 1] if label == 1 else [1, 0] for label in labels]
+    return (
+        np.array(seqs).astype(np.int32),
+        np.array(onehot).astype(np.float32),
+        np.array(labels).astype(np.float32),
+    )
+
+
+def pad_batch_2vec(b, seq_limit, embed_weights):
+    ''' cut or pad a batch of encoded sequences with index 0, then swap each
+        index for its entry of embed_weights
+
+        args:
+            b: iterable of records, r[0] encoded sequence, r[1] label 1 or 0
+            seq_limit: number of word vectors kept per sequence
+            embed_weights: lookup indexable by word index
+        returns:
+            (float32 vectors, float32 one-hot labels, float32 labels)
+    '''
+    vectors, labels, onehot = [], [], []
+    for record in b:
+        indices = record[0]
+        if len(indices) < seq_limit:
+            indices = indices + [0] * (seq_limit - len(indices))
+        else:
+            indices = indices[:seq_limit]
+        vectors.append([embed_weights[i] for i in indices])
+        labels.append(record[1])
+        onehot.append([0, 1] if record[1] == 1 else [1, 0])
+    return (
+        np.array(vectors).astype(np.float32),
+        np.array(onehot).astype(np.float32),
+        np.array(labels).astype(np.float32),
+    )
+
+
+def check_exist_or_download(url):
+    ''' download url into download_dir unless already there; return the path '''
+    import urllib.request  # plain `import urllib` does not load the submodule
+    filename = os.path.join(download_dir, url.rsplit('/', 1)[-1])
+    if not os.path.isfile(filename):
+        print("Downloading %s" % url)
+        urllib.request.urlretrieve(url, filename)
+    return filename
+
+
+def unzip_data(download_dir, data_gz):  # returns the extracted aclImdb dir
+    data_dir = os.path.join(download_dir, 'aclImdb')  # no trailing '/' needed
+    if not os.path.exists(data_dir):  # extract the archive only once
+        print("extracting %s to %s" % (data_gz, data_dir))
+        with tarfile.open(data_gz) as tar:
+            tar.extractall(download_dir)
+    return data_dir
+
+
+def strip_html(text):
+    ''' lambda fn for cleaning html:
+        parse text with html.parser and keep only its text content '''
+    return BeautifulSoup(text, "html.parser").get_text()
+
+
+def remove_between_square_brackets(text):
+    ''' lambda fn for cleaning square brackets: drops every [...] span '''
+    return re.sub(r'\[[^]]*\]', '', text)
+
+
+def remove_special_characters(text, remove_digits=True):
+    ''' lambda fn for removing special char: keeps ASCII letters, digits and
+        whitespace only
+        NOTE(review): remove_digits is accepted but ignored; digits are kept '''
+    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
+
+
+def simple_stemmer(text):
+    ''' lambda fn for stemming: Porter-stem each whitespace-separated word
+        and join the stems with single spaces '''
+    stem = PorterStemmer().stem
+    return ' '.join([stem(word) for word in text.split()])
+
+
+def remove_stopwords(text, tokenizer, stopword_list, is_lower_case=False):
+    ''' lambda fn for removing stopwords
+        args:
+            tokenizer: object whose tokenize(text) returns the tokens
+            stopword_list: words to drop
+            is_lower_case: if True tokens are compared as they are,
+                otherwise their lower-cased form is compared
+    '''
+    kept = []
+    for token in (t.strip() for t in tokenizer.tokenize(text)):
+        candidate = token if is_lower_case else token.lower()
+        if candidate not in stopword_list:
+            kept.append(token)
+    return ' '.join(kept)
+
+
+def tokenize(x):
+    ''' lambda fn for tokenize sentences
+        splits x on single spaces and drops the empty strings produced by
+        consecutive spaces
+        (other whitespace such as tabs or newlines stays inside the tokens)
+    '''
+    return [word for word in x.split(" ") if word]
+
+
+def encode_token(words, wv, w2i):
+    ''' lambda fn for encoding string seq to int seq
+        words missing from wv are skipped
+        args:
+            words: iterable of tokens
+            wv: word vector lookup dictionary
+            w2i: word2index lookup dictionary
+        returns:
+            list with w2i[w] for every kept word, in input order
+    '''
+    return [w2i[token] for token in words if token in wv]
+
+
+def preprocess():
+    ''' collect and preprocess raw data from acl Imdb dataset
+
+        downloads the imdb archive and the GoogleNews word2vec binary into
+        download_dir, cleans and tokenizes every review, keeps the words
+        known to word2vec and pickles the train/val split, embedding
+        weights and vocab lookups to preprocessed_imdb_data_fp
+    '''
+    nltk.download('stopwords')
+
+    print("preparing raw imdb data")
+    data_gz = check_exist_or_download(imdb_dataset_link)
+    data_dir = unzip_data(download_dir, data_gz)
+
+    # imdb dirs
+    train_pos_dir = data_dir + '/train/pos/'
+    train_neg_dir = data_dir + '/train/neg/'
+    test_pos_dir = data_dir + '/test/pos/'
+    test_neg_dir = data_dir + '/test/neg/'
+
+    # nltk helpers
+    tokenizer = ToktokTokenizer()
+    stopword_list = nltk.corpus.stopwords.words('english')
+
+    # load pretrained word2vec binary
+    print("loading pretrained word2vec")
+    google_news_pretrain_fp = check_exist_or_download(
+        google_news_pretrain_embeddings_link)
+    wv = KeyedVectors.load_word2vec_format(google_news_pretrain_fp, binary=True)
+
+    # parse flat files to memory, label 1 for pos and 0 for neg reviews
+    data = []
+    for review_dir, label in [(train_pos_dir, 1), (train_neg_dir, 0),
+                              (test_pos_dir, 1), (test_neg_dir, 0)]:
+        for filename in os.listdir(review_dir):
+            if filename.endswith(".txt"):
+                with open(os.path.join(review_dir, filename),
+                          "r",
+                          encoding="utf-8") as fhdl:
+                    data.append((fhdl.read(), label))
+
+    # text review cleaning
+    print("cleaning text review")
+    imdb_data = pd.DataFrame(data, columns=["review", "label"])
+    imdb_data['review'] = imdb_data['review'].apply(strip_html)
+    imdb_data['review'] = imdb_data['review'].apply(
+        remove_between_square_brackets)
+    imdb_data['review'] = imdb_data['review'].apply(remove_special_characters)
+    imdb_data['review'] = imdb_data['review'].apply(simple_stemmer)
+    imdb_data['review'] = imdb_data['review'].apply(remove_stopwords,
+                                                    args=(tokenizer,
+                                                          stopword_list))
+    imdb_data['token'] = imdb_data['review'].apply(tokenize)
+
+    # build word2index and index2word, vocab <pad> takes index 0
+    w2i = {"<pad>": 0}
+    i2w = {0: "<pad>"}
+
+    idx = 1  # start from idx 1
+    # iterate the column directly: Series.iteritems() was removed in pandas 2.0
+    for row in imdb_data['token']:
+        for w in row:
+            if w in wv and w not in w2i:
+                w2i[w] = idx
+                i2w[idx] = w
+                # NOTE(review): hard-coded vocab bound, skipped under python -O
+                assert idx < 28241
+                idx += 1
+    assert len(w2i) == len(i2w)
+    print("vocab size: ", len(w2i))
+
+    # encode tokens to int
+    imdb_data['encoded'] = imdb_data['token'].apply(encode_token,
+                                                    args=(wv, w2i))
+
+    # select word vector weights for embedding layer from vocab
+    # rows follow the insertion order of w2i, so row i belongs to index i
+    embed_weights = []
+    for w in w2i:
+        if w in wv:
+            embed_weights.append(wv[w])
+        else:
+            # only <pad> can lack a pretrained vector: use a zero vector
+            embed_weights.append(np.zeros([300]))
+    embed_weights = np.array(embed_weights)
+    print("embedding layer lookup weight shape: ", embed_weights.shape)
+
+    # split into train and test
+    train_data = imdb_data[['encoded', 'label']].values
+    train, val = train_test_split(train_data, test_size=0.33, random_state=42)
+
+    # save preprocessed for training
+    imdb_processed = {
+        "train": train,
+        "val": val,
+        "embed_weights": embed_weights,
+        "w2i": w2i,
+        "i2w": i2w
+    }
+    print("saving preprocessed file to ", preprocessed_imdb_data_fp)
+    with open(preprocessed_imdb_data_fp, 'wb') as handle:
+        pickle.dump(imdb_processed, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+if __name__ == "__main__":  # only when executed as a script
+    preprocess()