How to get return value as numpy.array? #6727

jinserk · 2022-07-15T13:24:17Z

jinserk
Jul 15, 2022

Here is my trials:

def make_fingerprint_feature(                                                                                                                                                                                                                                                  
    smiles: str,                                                                                                                                                                                                                                                               
    radius: int = 2,                                                                                                                                                                                                                                                           
    bitcount: int = 4096,                                                                                                                                                                                                                                                      
) -> np.array:                                                                                                                                                                                                                                                                 
    if bitcount % 8 != 0:                                                                                                                                                                                                                                                      
        raise Exception("fingerprints length must be multiples of 8")                                                                                                                                                                                                          
    mol = Chem.MolFromSmiles(smiles, sanitize=True)                                                                                                                                                                                                                            
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=radius, nBits=bitcount)
    arr = np.array(fp)                                                                                                                                                                          
    return arr   

if __name__ == "__main__":
    import dask
    from dask.distributed import Client, LocalCluster
    import dask.dataframe as dd
    import dask.dataframe as dd

    client = Client("tcp://scheduler:8786")
    #cluster = LocalCluster()
    #cluster.adapt(minimum=1, maximum=8)
    #client = Client(cluster)

   manifest_path = "./manifest.csv"

    def pandas_worker():
        tqdm.pandas()
        df = pd.read_csv(manifest_path)
        results = df.smiles.apply(make_fingerprint_feature)
        return results

    def dask_worker1():
        ddf = dd.read_csv(manifest_path)
        #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        #    print(df.head(100))
        futures = client.map(make_fingerprint_feature, ddf.smiles)
        results = client.gather(futures)
        return results

    def dask_worker2():
        ddf = dd.read_csv(manifest_path)
        ddf = ddf.repartition(npartitions=4)

        def mff_wrapper(df):
            return df['smiles'].apply(make_fingerprint_feature)

        delays = ddf.map_partitions(mff_wrapper, meta=pd.Series(dtype=np.array))
        results = pd.Series(delays.compute())
        return results

    t = time.time()
    results = dask_worker2()
    et = time.time() - t
    print(f"elapsed time for dask: {et:.3f} secs")

Firstly I've tried to do the above using dask_worker1 but realized this is an anti-pattern for large-rows dataframes.
So I made another one as dask_worker2 but it complains

TypeError: Cannot interpret '<built-in function array>' as a data type

Is there any good way to use numpy array as the return type?

jinserk · 2022-07-15T14:31:41Z

jinserk
Jul 15, 2022
Author

This code looks worked:

    def dask_worker2():
        ddf = dd.read_csv(manifest_path)
        ddf = ddf.repartition(npartitions=4)

        def mff_wrapper(dfd):
            df = dfd.compute()
            return df.smiles.apply(make_fingerprint_feature)

        futures = client.map(mff_wrapper, ddf.to_delayed())
        results = client.gather(futures)
        return results

Is this a typical way to assign partitioned dataframe to distribued client?

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

How to get return value as numpy.array? #6727

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Select a reply

Uh oh!

Uh oh!

How to get return value as numpy.array? #6727

Uh oh!

Uh oh!

jinserk Jul 15, 2022

Replies: 1 comment

Uh oh!

Uh oh!

jinserk Jul 15, 2022 Author

jinserk
Jul 15, 2022

jinserk
Jul 15, 2022
Author